diff --git a/Cargo.lock b/Cargo.lock index b40e128bc676f..ceaaee00b5bad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4648,7 +4648,7 @@ checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" [[package]] name = "local_stats_alloc" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "workspace-hack", ] @@ -6098,7 +6098,7 @@ dependencies = [ [[package]] name = "pgwire" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "auto_enums", @@ -7241,7 +7241,7 @@ dependencies = [ [[package]] name = "risedev" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "chrono", @@ -7270,7 +7270,7 @@ dependencies = [ [[package]] name = "risedev-config" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "clap", @@ -7283,7 +7283,7 @@ dependencies = [ [[package]] name = "risingwave-fields-derive" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "expect-test", "indoc", @@ -7295,7 +7295,7 @@ dependencies = [ [[package]] name = "risingwave_backup" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "async-trait", @@ -7317,7 +7317,7 @@ dependencies = [ [[package]] name = "risingwave_batch" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "assert_matches", @@ -7363,7 +7363,7 @@ dependencies = [ [[package]] name = "risingwave_bench" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "aws-config", @@ -7397,7 +7397,7 @@ dependencies = [ [[package]] name = "risingwave_cmd" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "clap", "madsim-tokio", @@ -7418,7 +7418,7 @@ dependencies = [ [[package]] name = "risingwave_cmd_all" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "clap", @@ -7450,7 +7450,7 @@ dependencies = [ [[package]] name = "risingwave_common" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "arc-swap", @@ -7550,7 +7550,7 @@ dependencies = [ [[package]] name = "risingwave_common_heap_profiling" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "chrono", @@ -7565,7 +7565,7 @@ dependencies = [ [[package]] name = "risingwave_common_proc_macro" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "bae", "proc-macro-error", @@ -7576,7 +7576,7 @@ dependencies = [ [[package]] name = "risingwave_common_service" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "futures", @@ -7597,7 +7597,7 @@ dependencies = [ [[package]] name = "risingwave_compaction_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "async-trait", @@ -7624,7 +7624,7 @@ dependencies = [ [[package]] name = "risingwave_compactor" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "await-tree", @@ -7646,7 +7646,7 @@ dependencies = [ [[package]] name = "risingwave_compute" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "async-trait", @@ -7689,7 +7689,7 @@ dependencies = [ [[package]] name = "risingwave_connector" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "apache-avro 0.16.0", @@ -7790,7 +7790,7 @@ dependencies = [ [[package]] name = "risingwave_ctl" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "bytes", @@ -7825,7 +7825,7 @@ dependencies = [ [[package]] name = "risingwave_e2e_extended_mode_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" 
dependencies = [ "anyhow", "chrono", @@ -7840,7 +7840,7 @@ dependencies = [ [[package]] name = "risingwave_error" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "bincode 1.3.3", "bytes", @@ -7855,7 +7855,7 @@ dependencies = [ [[package]] name = "risingwave_expr" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "arrow-array", @@ -7892,7 +7892,7 @@ dependencies = [ [[package]] name = "risingwave_expr_impl" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "aho-corasick", "anyhow", @@ -7939,7 +7939,7 @@ dependencies = [ [[package]] name = "risingwave_frontend" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "arc-swap", @@ -8009,7 +8009,7 @@ dependencies = [ [[package]] name = "risingwave_hummock_sdk" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "bytes", "easy-ext", @@ -8024,7 +8024,7 @@ dependencies = [ [[package]] name = "risingwave_hummock_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "bytes", @@ -8056,7 +8056,7 @@ dependencies = [ [[package]] name = "risingwave_hummock_trace" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "bincode 2.0.0-rc.3", @@ -8120,7 +8120,7 @@ dependencies = [ [[package]] name = "risingwave_mem_table_spill_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "bytes", @@ -8136,7 +8136,7 @@ dependencies = [ [[package]] name = "risingwave_meta" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "arc-swap", @@ -8204,7 +8204,7 @@ dependencies = [ [[package]] name = "risingwave_meta_model_migration" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-std", "sea-orm-migration", @@ -8213,7 +8213,7 @@ dependencies = [ [[package]] name = "risingwave_meta_model_v2" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "risingwave_pb", "sea-orm", @@ -8223,7 +8223,7 @@ dependencies = [ [[package]] name = "risingwave_meta_node" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "clap", @@ -8253,7 +8253,7 @@ dependencies = [ [[package]] name = "risingwave_meta_service" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "async-trait", @@ -8278,7 +8278,7 @@ dependencies = [ [[package]] name = "risingwave_object_store" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "await-tree", @@ -8311,7 +8311,7 @@ dependencies = [ [[package]] name = "risingwave_pb" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "enum-as-inner", "fs-err", @@ -8331,7 +8331,7 @@ dependencies = [ [[package]] name = "risingwave_planner_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "expect-test", @@ -8353,7 +8353,7 @@ dependencies = [ [[package]] name = "risingwave_regress_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "clap", @@ -8367,7 +8367,7 @@ dependencies = [ [[package]] name = "risingwave_rpc_client" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "async-trait", @@ -8397,7 +8397,7 @@ dependencies = [ [[package]] name = "risingwave_rt" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "await-tree", "console", @@ -8476,7 +8476,7 @@ dependencies = [ [[package]] name = "risingwave_source" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "assert_matches", @@ -8498,7 +8498,7 @@ 
dependencies = [ [[package]] name = "risingwave_sqlparser" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "itertools 0.12.0", "matches", @@ -8525,7 +8525,7 @@ dependencies = [ [[package]] name = "risingwave_sqlsmith" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "chrono", @@ -8552,7 +8552,7 @@ dependencies = [ [[package]] name = "risingwave_state_cleaning_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "clap", @@ -8572,7 +8572,7 @@ dependencies = [ [[package]] name = "risingwave_storage" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "arc-swap", @@ -8587,6 +8587,7 @@ dependencies = [ "dyn-clone", "either", "enum-as-inner", + "expect-test", "fail", "fiemap", "foyer", @@ -8638,7 +8639,7 @@ dependencies = [ [[package]] name = "risingwave_stream" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "assert_matches", @@ -8698,7 +8699,7 @@ dependencies = [ [[package]] name = "risingwave_test_runner" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "fail", "sync-point", @@ -8725,7 +8726,7 @@ dependencies = [ [[package]] name = "risingwave_variables" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "chrono", "workspace-hack", @@ -11523,11 +11524,11 @@ dependencies = [ [[package]] name = "with_options" -version = "1.3.0-alpha" +version = "1.5.0-alpha" [[package]] name = "workspace-config" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "log", "openssl-sys", @@ -11538,7 +11539,7 @@ dependencies = [ [[package]] name = "workspace-hack" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "ahash 0.8.3", "allocator-api2", diff --git a/Cargo.toml b/Cargo.toml index 543183b232263..b16f130797705 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,7 +62,7 @@ exclude = ["lints"] resolver = "2" [workspace.package] -version = "1.3.0-alpha" +version = "1.5.0-alpha" edition = "2021" homepage = "https://github.com/risingwavelabs/risingwave" keywords = ["sql", "database", "streaming"] diff --git a/backwards-compat-tests/scripts/utils.sh b/backwards-compat-tests/scripts/utils.sh index 07c06ecfdef38..dc7ea42a0c481 100644 --- a/backwards-compat-tests/scripts/utils.sh +++ b/backwards-compat-tests/scripts/utils.sh @@ -129,37 +129,47 @@ version_lt() { ################################### Entry Points -# Get $OLD_VERSION and $NEW_VERSION for Risingwave -get_rw_versions() { - # For backwards compat test we assume we are testing the latest version of RW (i.e. latest main commit) - # against the Nth latest release candidate, where N > 1. N can be larger, - # in case some old cluster did not upgrade. - local VERSION_OFFSET=4 - - # First we obtain a list of versions from git branch names. - # Then we normalize them to semver format (MAJOR.MINOR.PATCH). - echo "--- git branch origin output" - git branch -r | grep origin - - # Extract X.Y.Z tags - echo "--- VERSION BRANCHES" - local tags=$(git tag | grep -E "^v[0-9]+\.[0-9]+\.[0-9]+$" | tr -d 'v' | tr -d ' ') - echo "$tags" - - # Then we sort them in descending order. - echo "--- VERSIONS" - local sorted_versions=$(echo -e "$tags" | sort -t '.' -n) - echo "$sorted_versions" - - # Then we take the Nth latest version. - # We set $OLD_VERSION to this. - OLD_VERSION=$(echo -e "$sorted_versions" | tail -n $VERSION_OFFSET | head -1) +get_old_version() { + # For backwards compat test we assume we are testing the latest version of RW (i.e. 
latest main commit)
+  # against the Nth latest release candidate, where N > 1. N can be larger,
+  # in case some old cluster did not upgrade.
+  if [[ -z $VERSION_OFFSET ]]
+  then
+    local VERSION_OFFSET=1
+  fi
+
+  # First we obtain a list of versions from git branch names.
+  # Then we normalize them to semver format (MAJOR.MINOR.PATCH).
+  echo "--- git branch origin output"
+  git branch -r | grep origin
+
+  # Extract X.Y.Z tags
+  echo "--- VERSION BRANCHES"
+  local tags=$(git tag | grep -E "^v[0-9]+\.[0-9]+\.[0-9]+$" | tr -d 'v' | tr -d ' ')
+  echo "$tags"
+
+  # Then we sort them in descending order.
+  echo "--- VERSIONS"
+  local sorted_versions=$(echo -e "$tags" | sort -t '.' -n)
+  echo "$sorted_versions"
+
+  # Then we take the Nth latest version.
+  # We set $OLD_VERSION to this.
+  OLD_VERSION=$(echo -e "$sorted_versions" | tail -n $VERSION_OFFSET | head -1)
+}
+
+get_new_version() {
   # Next, for $NEW_VERSION we just scrape it from `workspace.package.version`.
   NEW_VERSION=$(cat Cargo.toml | grep "\[workspace\.package\]" -A 5 | sed -n 's/version = \"\([0-9]*\.[0-9]*\.[0-9]*\).*/\1/p' | tr -d ' ')
+}
+
+# Get $OLD_VERSION and $NEW_VERSION for Risingwave
+get_rw_versions() {
+  get_old_version
+  get_new_version

-  # Then we assert that `$OLD_VERSION` < `$NEW_VERSION`.
-  if version_lt "$OLD_VERSION" "$NEW_VERSION"
+  # Then we assert that `$OLD_VERSION` <= `$NEW_VERSION`.
+  if version_le "$OLD_VERSION" "$NEW_VERSION"
   then
     echo "OLD_VERSION: $OLD_VERSION"
     echo "NEW_VERSION: $NEW_VERSION"
diff --git a/ci/scripts/run-backfill-tests.sh b/ci/scripts/run-backfill-tests.sh
index b0010af68c640..dddf88e4b4cac 100755
--- a/ci/scripts/run-backfill-tests.sh
+++ b/ci/scripts/run-backfill-tests.sh
@@ -96,21 +96,20 @@ restart_cn() {
 test_snapshot_and_upstream_read() {
   echo "--- e2e, ci-backfill, test_snapshot_and_upstream_read"
   cargo make ci-start ci-backfill
-
-  run_sql_file "$PARENT_PATH"/sql/backfill/create_base_table.sql
+  run_sql_file "$PARENT_PATH"/sql/backfill/basic/create_base_table.sql

   # Provide snapshot
-  run_sql_file "$PARENT_PATH"/sql/backfill/insert.sql
+  run_sql_file "$PARENT_PATH"/sql/backfill/basic/insert.sql

   # Provide updates ...
-  run_sql_file "$PARENT_PATH"/sql/backfill/insert.sql &
+  run_sql_file "$PARENT_PATH"/sql/backfill/basic/insert.sql &

   # ... and concurrently create mv.
- run_sql_file "$PARENT_PATH"/sql/backfill/create_mv.sql & + run_sql_file "$PARENT_PATH"/sql/backfill/basic/create_mv.sql & wait - run_sql_file "$PARENT_PATH"/sql/backfill/select.sql \ + --data-directory hummock_001 \ + --config-path /risingwave.toml\" \ + --compute-opts=\" \ + --config-path /risingwave.toml \ + --listen-addr 0.0.0.0:5688 \ + --prometheus-listener-addr 0.0.0.0:1222 \ + --advertise-addr 0.0.0.0:5688 \ + --async-stack-trace verbose \ + --connector-rpc-endpoint 0.0.0.0:50051 \ + # --parallelism 4 \ + --role both \ + --meta-address http://0.0.0.0:5690\" \ + --frontend-opts=\" \ + --config-path /risingwave.toml \ + --listen-addr 0.0.0.0:4566 \ + --advertise-addr 0.0.0.0:4566 \ + --prometheus-listener-addr 0.0.0.0:2222 \ + --health-check-listener-addr 0.0.0.0:6786 \ + --meta-addr http://0.0.0.0:5690\" \ + --compactor-opts=\" \ + --listen-addr 0.0.0.0:6660 \ + --prometheus-listener-addr 0.0.0.0:1260 \ + --advertise-addr 0.0.0.0:6660 \ + --meta-address http://0.0.0.0:5690\"" + expose: + - "6660" + - "1260" + - "4566" + - "5688" + - "1222" + - "5690" + - "1250" + - "5691" + - "2222" + ports: + - "4566:4566" + - "5690:5690" + - "5691:5691" + - "1222:1222" + - "1250:1250" + - "1260:1260" + - "2222:2222" + depends_on: + - etcd-0 + volumes: + - "./risingwave.toml:/risingwave.toml" + environment: + RUST_BACKTRACE: "1" + # If ENABLE_TELEMETRY is not set, telemetry will start by default + ENABLE_TELEMETRY: ${ENABLE_TELEMETRY:-true} + container_name: risingwave-standalone + healthcheck: + test: + - CMD-SHELL + - bash -c 'printf \"GET / HTTP/1.1\n\n\" > /dev/tcp/127.0.0.1/6660; exit $$?;' + - bash -c 'printf \"GET / HTTP/1.1\n\n\" > /dev/tcp/127.0.0.1/5688; exit $$?;' + - bash -c 'printf \"GET / HTTP/1.1\n\n\" > /dev/tcp/127.0.0.1/4566; exit $$?;' + - bash -c 'printf \"GET / HTTP/1.1\n\n\" > /dev/tcp/127.0.0.1/5690; exit $$?;' + interval: 1s + timeout: 5s + restart: always + deploy: + resources: + limits: + memory: + reservations: + memory: + etcd-0: + extends: + file: docker-compose.yml + service: etcd-0 + grafana-0: + extends: + file: docker-compose.yml + service: grafana-0 + prometheus-0: + extends: + file: docker-compose.yml + service: prometheus-0 +volumes: + etcd-0: + external: false + grafana-0: + external: false + prometheus-0: + external: false diff --git a/integration_tests/big-query-sink/README.md b/integration_tests/big-query-sink/README.md index 1f06d3dfe1172..78c20a9866904 100644 --- a/integration_tests/big-query-sink/README.md +++ b/integration_tests/big-query-sink/README.md @@ -23,9 +23,9 @@ CREATE table '${project_id}'.'${dataset_id}'.'${table_id}'( 4. Execute the SQL queries in sequence: -- append-only/create_source.sql -- append-only/create_mv.sql -- append-only/create_sink.sql +- create_source.sql +- create_mv.sql +- create_sink.sql 1. We need to obtain the JSON file for Google Cloud service accounts, which can be configured here: https://console.cloud.google.com/iam-admin/serviceaccounts. 2. 
Because BigQuery has limited support for updates and deletes, we currently only support 'append only' diff --git a/integration_tests/big-query-sink/append-only-sql/create_mv.sql b/integration_tests/big-query-sink/create_mv.sql similarity index 100% rename from integration_tests/big-query-sink/append-only-sql/create_mv.sql rename to integration_tests/big-query-sink/create_mv.sql diff --git a/integration_tests/big-query-sink/append-only-sql/create_sink.sql b/integration_tests/big-query-sink/create_sink.sql similarity index 100% rename from integration_tests/big-query-sink/append-only-sql/create_sink.sql rename to integration_tests/big-query-sink/create_sink.sql diff --git a/integration_tests/big-query-sink/append-only-sql/create_source.sql b/integration_tests/big-query-sink/create_source.sql similarity index 100% rename from integration_tests/big-query-sink/append-only-sql/create_source.sql rename to integration_tests/big-query-sink/create_source.sql diff --git a/integration_tests/big-query-sink/prepare.sh b/integration_tests/big-query-sink/prepare.sh new file mode 100755 index 0000000000000..ca9cc3284939d --- /dev/null +++ b/integration_tests/big-query-sink/prepare.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -euo pipefail + +# set gcloud +docker compose exec gcloud-cli gcloud auth login --cred-file=/gcp-rwctest.json + +docker compose exec gcloud-cli gcloud config set project rwctest + +bq_prepare_file='bq_prepare.sql' +bq_prepare_content=$(cat $bq_prepare_file) + +docker compose exec gcloud-cli bq query --use_legacy_sql=false "$bq_prepare_content" + +sleep 10 diff --git a/integration_tests/big-query-sink/sink_check b/integration_tests/big-query-sink/sink_check deleted file mode 100644 index 14cfded736c5e..0000000000000 --- a/integration_tests/big-query-sink/sink_check +++ /dev/null @@ -1 +0,0 @@ -rwctest.bqtest.bq_sink diff --git a/integration_tests/big-query-sink/sink_check.py b/integration_tests/big-query-sink/sink_check.py new file mode 100644 index 0000000000000..7388c853c11cd --- /dev/null +++ b/integration_tests/big-query-sink/sink_check.py @@ -0,0 +1,25 @@ +import json +import subprocess +import sys + +relations = ['rwctest.bqtest.bq_sink'] + +failed_cases = [] +for rel in relations: + sql = f"SELECT COUNT(*) AS count FROM `{rel}`" + print(f"run sql: {sql} on Bigquery") + rows = subprocess.check_output( + ["docker", "compose", "exec", "gcloud-cli", "bq", "query", "--use_legacy_sql=false", "--format=json", sql], + ) + rows = int(json.loads(rows.decode("utf-8").strip())[0]['count']) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + + drop_sql = f"DROP TABLE IF EXISTS `{rel}`" + subprocess.run(["docker", "compose", "exec", "gcloud-cli", "bq", "query", "--use_legacy_sql=false", drop_sql], + check=True) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/cassandra-and-scylladb-sink/prepare.sh b/integration_tests/cassandra-and-scylladb-sink/prepare.sh new file mode 100755 index 0000000000000..690537d878208 --- /dev/null +++ b/integration_tests/cassandra-and-scylladb-sink/prepare.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -euo pipefail + +# wait for cassandra and scylladb to start up +sleep 60 + +# setup cassandra +docker compose exec cassandra cqlsh -f prepare_cassandra_and_scylladb.sql + +# setup scylladb +docker compose exec scylladb cqlsh -f prepare_cassandra_and_scylladb.sql diff --git a/integration_tests/cassandra-and-scylladb-sink/sink_check 
b/integration_tests/cassandra-and-scylladb-sink/sink_check deleted file mode 100644 index 49a88f8df2245..0000000000000 --- a/integration_tests/cassandra-and-scylladb-sink/sink_check +++ /dev/null @@ -1 +0,0 @@ -demo.demo_bhv_table diff --git a/integration_tests/cassandra-and-scylladb-sink/sink_check.py b/integration_tests/cassandra-and-scylladb-sink/sink_check.py new file mode 100644 index 0000000000000..2087e002d9f44 --- /dev/null +++ b/integration_tests/cassandra-and-scylladb-sink/sink_check.py @@ -0,0 +1,40 @@ +import subprocess +import sys +from time import sleep + +sleep(30) + +relations = ['demo.demo_bhv_table'] + +dbs = ['cassandra', 'scylladb'] +failed_cases = [] +for rel in relations: + sql = f'select count(*) from {rel};' + for db in dbs: + print(f"Running SQL: {sql} on {db}") + query_output_file_name = f"query_{db}_output.txt" + query_output_file = open(query_output_file_name, "wb+") + + subprocess.run(["docker", "compose", "exec", db, "cqlsh", "-e", sql], check=True, + stdout=query_output_file) + + # output file: + # + # count + # ------- + # 1000 + # + # (1 rows) + query_output_file.seek(0) + lines = query_output_file.readlines() + query_output_file.close() + assert len(lines) >= 6 + assert lines[1].decode('utf-8').strip().lower() == 'count' + rows = int(lines[3].decode('utf-8').strip()) + print(f"{rows} rows in {db}.{rel}") + if rows < 1: + failed_cases.append(db + "_" + rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/clickhouse-sink/README.md b/integration_tests/clickhouse-sink/README.md index a383f3fba5ee4..efcf995fb3df8 100644 --- a/integration_tests/clickhouse-sink/README.md +++ b/integration_tests/clickhouse-sink/README.md @@ -14,7 +14,7 @@ The cluster contains a RisingWave cluster and its necessary dependencies, a data 2. Create the ClickHouse table: ```sh -docker compose exec clickhouse-server bash /opt/clickhouse/clickhouse-sql/run-sql-file.sh create_clickhouse_table +./prepare.sh ``` 3. Execute the SQL queries in sequence: @@ -28,8 +28,7 @@ We only support `upsert` with clickhouse' `CollapsingMergeTree` and `VersionedCo 4. 
Execute a simple query: ```sh -docker compose exec clickhouse-server bash /opt/clickhouse/clickhouse-sql/run-sql-file.sh clickhouse_query - +docker compose exec clickhouse-server clickhouse-client ``` ```sql diff --git a/integration_tests/clickhouse-sink/clickhouse-sql/clickhouse_query.sql b/integration_tests/clickhouse-sink/clickhouse-sql/clickhouse_query.sql deleted file mode 100644 index a349770369552..0000000000000 --- a/integration_tests/clickhouse-sink/clickhouse-sql/clickhouse_query.sql +++ /dev/null @@ -1 +0,0 @@ -select user_id, count(*) from default.demo_test group by user_id limit 10 diff --git a/integration_tests/clickhouse-sink/clickhouse-sql/run-sql-file.sh b/integration_tests/clickhouse-sink/clickhouse-sql/run-sql-file.sh deleted file mode 100644 index a122d09dcd424..0000000000000 --- a/integration_tests/clickhouse-sink/clickhouse-sql/run-sql-file.sh +++ /dev/null @@ -1,3 +0,0 @@ -set -ex - -clickhouse-client < /opt/clickhouse/clickhouse-sql/$1.sql \ No newline at end of file diff --git a/integration_tests/clickhouse-sink/clickhouse-sql/create_clickhouse_table.sql b/integration_tests/clickhouse-sink/clickhouse_prepare.sql similarity index 100% rename from integration_tests/clickhouse-sink/clickhouse-sql/create_clickhouse_table.sql rename to integration_tests/clickhouse-sink/clickhouse_prepare.sql diff --git a/integration_tests/clickhouse-sink/docker-compose.yml b/integration_tests/clickhouse-sink/docker-compose.yml index 8129c7d618daf..76b0f7fe607f5 100644 --- a/integration_tests/clickhouse-sink/docker-compose.yml +++ b/integration_tests/clickhouse-sink/docker-compose.yml @@ -12,7 +12,7 @@ services: expose: - 9009 volumes: - - ./clickhouse-sql:/opt/clickhouse/clickhouse-sql + - ./clickhouse_prepare.sql:/clickhouse_prepare.sql risingwave-standalone: extends: file: ../../docker/docker-compose.yml @@ -33,10 +33,6 @@ services: extends: file: ../../docker/docker-compose.yml service: prometheus-0 - message_queue: - extends: - file: ../../docker/docker-compose.yml - service: message_queue volumes: risingwave-standalone: external: false diff --git a/integration_tests/clickhouse-sink/prepare.sh b/integration_tests/clickhouse-sink/prepare.sh new file mode 100755 index 0000000000000..cb8ec629b254e --- /dev/null +++ b/integration_tests/clickhouse-sink/prepare.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -euo pipefail + +# setup clickhouse +docker compose exec clickhouse-server bash -c "clickhouse-client < /clickhouse_prepare.sql" diff --git a/integration_tests/clickhouse-sink/sink_check.py b/integration_tests/clickhouse-sink/sink_check.py new file mode 100644 index 0000000000000..bb18e7e93ddd7 --- /dev/null +++ b/integration_tests/clickhouse-sink/sink_check.py @@ -0,0 +1,22 @@ +import subprocess +import sys +from time import sleep + +sleep(30) + +relations = ['default.demo_test'] + +failed_cases = [] +for rel in relations: + sql = f"SELECT COUNT(*) FROM {rel};" + print(f"Running SQL: {sql} ON ClickHouse") + command = f'clickhouse-client -q "{sql}"' + rows = subprocess.check_output(["docker", "compose", "exec", "clickhouse-server", "bash", "-c", command]) + rows = int(rows.decode('utf-8').strip()) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/cockroach-sink/README.md b/integration_tests/cockroach-sink/README.md index 5792c08021be8..c7ef841ea475b 100644 --- a/integration_tests/cockroach-sink/README.md +++ 
b/integration_tests/cockroach-sink/README.md @@ -5,9 +5,8 @@ This demo showcases how to sink RisingWave's data to an external CockroachDB. A During CI, the integration test will: 1. Run `docker compose up -d` and start the cluster. -2. After 20-30s, run `create_source.sql`. -3. After 10s, run `create_mv.sql`. -4. After another 10s, the tester will check if the ingestion is successful by creating a materialized view upon the source. It also checks if the MV created in the 3rd step has persisted the data. +2. After 20-30s, run `create_source.sql`, `create_mv.sql`, `create_sink.sql` +3. After another 30s, the tester will check if the ingestion is successful by `SELECT COUNT(*) FROM target_count;` in CockroachDB. To connect to the Postgres outside the container via psql: diff --git a/integration_tests/cockroach-sink/postgres_prepare.sql b/integration_tests/cockroach-sink/cockroach_prepare.sql similarity index 100% rename from integration_tests/cockroach-sink/postgres_prepare.sql rename to integration_tests/cockroach-sink/cockroach_prepare.sql diff --git a/integration_tests/cockroach-sink/create_mv.sql b/integration_tests/cockroach-sink/create_mv.sql index 29fdfa5cfdc4c..2cba41795922f 100644 --- a/integration_tests/cockroach-sink/create_mv.sql +++ b/integration_tests/cockroach-sink/create_mv.sql @@ -6,13 +6,3 @@ FROM user_behaviors GROUP BY target_id; - -CREATE SINK target_count_postgres_sink -FROM - target_count WITH ( - connector = 'jdbc', - jdbc.url = 'jdbc:postgresql://cockroachdb:26257/defaultdb?user=root', - table.name = 'target_count', - type = 'upsert', - primary_key = 'target_id' - ); diff --git a/integration_tests/cockroach-sink/create_sink.sql b/integration_tests/cockroach-sink/create_sink.sql new file mode 100644 index 0000000000000..87c767f3dc3bd --- /dev/null +++ b/integration_tests/cockroach-sink/create_sink.sql @@ -0,0 +1,20 @@ +CREATE SINK target_count_postgres_sink +FROM + target_count WITH ( + connector = 'jdbc', + jdbc.url = 'jdbc:postgresql://cockroachdb:26257/defaultdb?user=root', + table.name = 'target_count', + type = 'upsert', + primary_key = 'target_id' + ); + +-- sink data_type table to pg +CREATE SINK data_types_postgres_sink +FROM + data_types WITH ( + connector = 'jdbc', + jdbc.url = 'jdbc:postgresql://cockroachdb:26257/defaultdb?user=root', + table.name = 'data_types', + type='upsert', + primary_key = 'id' +); diff --git a/integration_tests/cockroach-sink/create_source.sql b/integration_tests/cockroach-sink/create_source.sql index b37504e75dcce..68308df89ce9b 100644 --- a/integration_tests/cockroach-sink/create_source.sql +++ b/integration_tests/cockroach-sink/create_source.sql @@ -1,4 +1,4 @@ -CREATE SOURCE user_behaviors ( +CREATE TABLE user_behaviors ( user_id VARCHAR, target_id VARCHAR, target_type VARCHAR, @@ -7,10 +7,11 @@ CREATE SOURCE user_behaviors ( parent_target_type VARCHAR, parent_target_id VARCHAR ) WITH ( - connector = 'kafka', - topic = 'user_behaviors', - properties.bootstrap.server = 'message_queue:29092', - scan.startup.mode = 'earliest' + connector = 'datagen', + fields.user_id.kind = 'sequence', + fields.user_id.start = 1, + fields.user_id.end = 100, + datagen.rows.per.second = '100' ) FORMAT PLAIN ENCODE JSON; CREATE TABLE data_types ( @@ -34,17 +35,6 @@ CREATE TABLE data_types ( array_column VARCHAR[] ); --- sink data_type table to pg -CREATE SINK data_types_postgres_sink -FROM - data_types WITH ( - connector = 'jdbc', - jdbc.url = 'jdbc:postgresql://cockroachdb:26257/defaultdb?user=root', - table.name = 'data_types', - type='upsert', - 
primary_key = 'id' -); - INSERT INTO data_types (id, varchar_column, text_column, integer_column, smallint_column, bigint_column, decimal_column, real_column, double_column, boolean_column, date_column, time_column, timestamp_column, timestamptz_column, interval_column, jsonb_column, bytea_column, array_column) VALUES (1, 'Varchar value 1', 'Text value 1', 123, 456, 789, 12.34, 56.78, 90.12, TRUE, '2023-05-22', '12:34:56', '2023-05-22 12:34:56', '2023-05-22 12:34:56+00:00', '1 day', '{"key": "value"}', E'\\xDEADBEEF', ARRAY['Value 1', 'Value 2']), @@ -52,4 +42,3 @@ VALUES (3, 'Varchar value 3', 'Text value 3', 345, 678, 901, 34.56, 78.90, 12.34, TRUE, '2023-05-24', '12:34:56', '2023-05-24 12:34:56', '2023-05-24 12:34:56+00:00', '3 days', '{"key": "value3"}', E'\\xCAFEBABE', ARRAY['Value 5', 'Value 6']), (4, 'Varchar value 4', 'Text value 4', 456, 789, 012, 45.67, 89.01, 23.45, FALSE, '2023-05-25', '23:45:01', '2023-05-25 23:45:01', '2023-05-25 23:45:01+00:00', '4 days', '{"key": "value4"}', E'\\xBABEC0DE', ARRAY['Value 7', 'Value 8']), (5, 'Varchar value 5', 'Text value 5', 567, 890, 123, 56.78, 90.12, 34.56, TRUE, '2023-05-26', '12:34:56', '2023-05-26 12:34:56', '2023-05-26 12:34:56+00:00', '5 days', '{"key": "value5"}', E'\\xDEADBABE', ARRAY['Value 9', 'Value 10']); - diff --git a/integration_tests/cockroach-sink/data_check b/integration_tests/cockroach-sink/data_check deleted file mode 100644 index 3835eb979b86e..0000000000000 --- a/integration_tests/cockroach-sink/data_check +++ /dev/null @@ -1 +0,0 @@ -user_behaviors,target_count \ No newline at end of file diff --git a/integration_tests/cockroach-sink/docker-compose.yml b/integration_tests/cockroach-sink/docker-compose.yml index cde3ef8742815..a205dca9e19cf 100644 --- a/integration_tests/cockroach-sink/docker-compose.yml +++ b/integration_tests/cockroach-sink/docker-compose.yml @@ -21,19 +21,6 @@ services: extends: file: ../../docker/docker-compose.yml service: prometheus-0 - message_queue: - extends: - file: ../../docker/docker-compose.yml - service: message_queue - datagen: - build: ../datagen - depends_on: [message_queue] - command: - - /bin/sh - - -c - - /datagen --mode clickstream --qps 2 kafka --brokers message_queue:29092 - restart: always - container_name: datagen cockroachdb: image: cockroachdb/cockroach:v23.1.11 command: start-single-node --insecure @@ -42,17 +29,11 @@ services: - "8080:8080" # CockroachDB Web UI port restart: always container_name: cockroachdb - prepare_postgres: - image: postgres - depends_on: - - cockroachdb - command: - - /bin/sh - - -c - - "psql postgresql://root@cockroachdb:26257/defaultdb < postgres_prepare.sql" + postgres: + image: postgres:latest + command: tail -f /dev/null volumes: - - "./postgres_prepare.sql:/postgres_prepare.sql" - container_name: prepare_postgres + - "./cockroach_prepare.sql:/cockroach_prepare.sql" restart: on-failure volumes: risingwave-standalone: diff --git a/integration_tests/cockroach-sink/prepare.sh b/integration_tests/cockroach-sink/prepare.sh new file mode 100755 index 0000000000000..bf9c0e8103d45 --- /dev/null +++ b/integration_tests/cockroach-sink/prepare.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -euo pipefail + +# setup cockroach +docker compose exec postgres bash -c "psql postgresql://root@cockroachdb:26257/defaultdb < cockroach_prepare.sql" diff --git a/integration_tests/cockroach-sink/query.sql b/integration_tests/cockroach-sink/query.sql deleted file mode 100644 index e09c66a255f10..0000000000000 --- a/integration_tests/cockroach-sink/query.sql +++ /dev/null @@ 
-1,6 +0,0 @@ -SELECT - * -FROM - target_count -LIMIT - 10; \ No newline at end of file diff --git a/integration_tests/cockroach-sink/sink_check.py b/integration_tests/cockroach-sink/sink_check.py new file mode 100644 index 0000000000000..41c6c34e7da39 --- /dev/null +++ b/integration_tests/cockroach-sink/sink_check.py @@ -0,0 +1,20 @@ +import subprocess +import sys + +relations = ['target_count', 'data_types'] + +failed_cases = [] +for rel in relations: + sql = f'SELECT COUNT(*) FROM {rel};' + print(f"Running SQL: {sql} ON cockroach") + command = f'psql -U root -h cockroachdb -p 26257 -d defaultdb --tuples-only -c "{sql}"' + rows = subprocess.check_output( + ["docker", "compose", "exec", "postgres", "bash", "-c", command]) + rows = int(rows.decode('utf-8').strip()) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/elasticsearch-sink/sink_check b/integration_tests/elasticsearch-sink/sink_check deleted file mode 100644 index 9daeafb9864cf..0000000000000 --- a/integration_tests/elasticsearch-sink/sink_check +++ /dev/null @@ -1 +0,0 @@ -test diff --git a/integration_tests/elasticsearch-sink/sink_check.py b/integration_tests/elasticsearch-sink/sink_check.py new file mode 100644 index 0000000000000..0e6ad8eda4da4 --- /dev/null +++ b/integration_tests/elasticsearch-sink/sink_check.py @@ -0,0 +1,22 @@ +import json +import subprocess +import sys + +relations = ['test'] + +failed_cases = [] +versions = ['7', '8'] +for rel in relations: + query = f'curl -XGET -u elastic:risingwave "http://localhost:9200/{rel}/_count" -H "Content-Type: application/json"' + for v in versions: + es = 'elasticsearch{}'.format(v) + print(f"Running Query: {query} on {es}") + counts = subprocess.check_output(["docker", "compose", "exec", es, "bash", "-c", query]) + counts = json.loads(counts)['count'] + print("{} counts in {}_{}".format(counts, es, rel)) + if counts < 1: + failed_cases.append(es + '_' + rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/iceberg-sink/prepare.sh b/integration_tests/iceberg-sink/prepare.sh new file mode 100755 index 0000000000000..f95aa2cbee250 --- /dev/null +++ b/integration_tests/iceberg-sink/prepare.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -euo pipefail + +# setup +docker compose exec spark bash /spark-script/run-sql-file.sh create-table diff --git a/integration_tests/iceberg-sink/sink_check.py b/integration_tests/iceberg-sink/sink_check.py new file mode 100644 index 0000000000000..74c45c6d08bb2 --- /dev/null +++ b/integration_tests/iceberg-sink/sink_check.py @@ -0,0 +1,23 @@ +import subprocess +from time import sleep + +sleep(60) + +query_sql = open("iceberg-query.sql").read() + +print("querying iceberg with presto sql: %s" % query_sql) + +query_output_file_name = "query_output.txt" + +query_output_file = open(query_output_file_name, "wb") + +subprocess.run( + ["docker", "compose", "exec", "presto", "presto-cli", "--server", "localhost:8080", "--execute", query_sql], + check=True, stdout=query_output_file) +query_output_file.close() + +output_content = open(query_output_file_name).read() + +print(output_content) + +assert len(output_content.strip()) > 0 diff --git a/integration_tests/kafka-cdc-sink/pg_check b/integration_tests/kafka-cdc-sink/pg_check deleted file mode 100644 index cd31705b2b725..0000000000000 --- a/integration_tests/kafka-cdc-sink/pg_check +++ 
/dev/null @@ -1 +0,0 @@ -counts,flinkcounts,types,flink_types diff --git a/integration_tests/kafka-cdc-sink/sink_check.py b/integration_tests/kafka-cdc-sink/sink_check.py new file mode 100644 index 0000000000000..b27472f0cacc1 --- /dev/null +++ b/integration_tests/kafka-cdc-sink/sink_check.py @@ -0,0 +1,24 @@ +import subprocess +import sys +from time import sleep + +# wait for one and a half minutes for the flink test pipeline +print("wait for one minute for ingestion") +sleep(60) + +relations = ['counts', 'flinkcounts', 'types', 'flink_types'] + +failed_cases = [] +for rel in relations: + sql = f'SELECT COUNT(*) FROM {rel};' + print(f"Running SQL: {sql} on PG") + command = f'psql -U $POSTGRES_USER $POSTGRES_DB --tuples-only -c "{sql}"' + rows = subprocess.check_output(["docker", "exec", "postgres", "bash", "-c", command]) + rows = int(rows.decode('utf8').strip()) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/mindsdb/query_sink.sh b/integration_tests/mindsdb/query_sink.sh deleted file mode 100644 index fdfd9bf910576..0000000000000 --- a/integration_tests/mindsdb/query_sink.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -x # Enable printing of each command - -# The model creation may take a long time. Our estimate is 30 seconds. But it can be longer in lower-perf machines. -sleep 30 - -QUERY='SELECT rental_price FROM home_rentals_model WHERE number_of_bathrooms = 2 AND sqft = 1000;' -psql -h localhost -p 55432 -U mindsdb -d mindsdb -c "$QUERY" \ No newline at end of file diff --git a/integration_tests/mindsdb/sink_check.py b/integration_tests/mindsdb/sink_check.py new file mode 100644 index 0000000000000..20ab0db9f9137 --- /dev/null +++ b/integration_tests/mindsdb/sink_check.py @@ -0,0 +1,7 @@ +import subprocess + +# The model creation may take a long time. Our estimate is 30 seconds. But it can be longer in lower-perf machines. 
+ +sql = "SELECT rental_price FROM home_rentals_model WHERE number_of_bathrooms = 2 AND sqft = 1000;" + +subprocess.run(["psql", "-h", "localhost", "-p", "55432", "-U", "mindsdb", "-d", "mindsdb", "-c", sql], check=True) diff --git a/integration_tests/mysql-sink/create_mv.sql b/integration_tests/mysql-sink/create_mv.sql index 72d6bf833c6e8..2cba41795922f 100644 --- a/integration_tests/mysql-sink/create_mv.sql +++ b/integration_tests/mysql-sink/create_mv.sql @@ -6,42 +6,3 @@ FROM user_behaviors GROUP BY target_id; - -CREATE SINK target_count_mysql_sink -FROM - target_count WITH ( - connector = 'jdbc', - jdbc.url = 'jdbc:mysql://mysql:3306/mydb?user=root&password=123456', - table.name = 'target_count', - type = 'upsert', - primary_key = 'target_id' - ); - --- ingest the table back to RW -CREATE TABLE rw_typed_data ( - id BIGINT PRIMARY KEY, - varchar_column VARCHAR, - text_column TEXT, - integer_column INTEGER, - smallint_column SMALLINT, - bigint_column BIGINT, - decimal_column DECIMAL, - real_column REAL, - double_column DOUBLE PRECISION, - boolean_column BOOLEAN, - date_column DATE, - time_column TIME, - timestamp_column TIMESTAMP, - timestamptz_column TIMESTAMPTZ, - jsonb_column JSONB, - bytea_column BYTEA -) WITH ( - connector = 'mysql-cdc', - hostname = 'mysql', - port = '3306', - username = 'root', - password = '123456', - database.name = 'mydb', - table.name = 'data_types', - server.id = '3' -); diff --git a/integration_tests/mysql-sink/create_sink.sql b/integration_tests/mysql-sink/create_sink.sql new file mode 100644 index 0000000000000..bfe9bf6c0b70e --- /dev/null +++ b/integration_tests/mysql-sink/create_sink.sql @@ -0,0 +1,19 @@ +CREATE SINK target_count_mysql_sink +FROM + target_count WITH ( + connector = 'jdbc', + jdbc.url = 'jdbc:mysql://mysql:3306/mydb?user=root&password=123456', + table.name = 'target_count', + type = 'upsert', + primary_key = 'target_id' + ); + +CREATE SINK data_types_mysql_sink +FROM + data_types WITH ( + connector = 'jdbc', + jdbc.url = 'jdbc:mysql://mysql:3306/mydb?user=root&password=123456', + table.name = 'data_types', + type = 'upsert', + primary_key = 'id' + ); diff --git a/integration_tests/mysql-sink/create_source.sql b/integration_tests/mysql-sink/create_source.sql index eb13c5a37cf83..f049457aa3121 100644 --- a/integration_tests/mysql-sink/create_source.sql +++ b/integration_tests/mysql-sink/create_source.sql @@ -7,10 +7,11 @@ CREATE SOURCE user_behaviors ( parent_target_type VARCHAR, parent_target_id VARCHAR ) WITH ( - connector = 'kafka', - topic = 'user_behaviors', - properties.bootstrap.server = 'message_queue:29092', - scan.startup.mode = 'earliest' + connector = 'datagen', + fields.user_id.kind = 'sequence', + fields.user_id.start = 1, + fields.user_id.end = 100, + datagen.rows.per.second = '100' ) FORMAT PLAIN ENCODE JSON; CREATE TABLE data_types ( @@ -32,16 +33,6 @@ CREATE TABLE data_types ( bytea_column BYTEA ); -CREATE SINK data_types_mysql_sink -FROM - data_types WITH ( - connector = 'jdbc', - jdbc.url = 'jdbc:mysql://mysql:3306/mydb?user=root&password=123456', - table.name = 'data_types', - type = 'upsert', - primary_key = 'id' - ); - INSERT INTO data_types (id, varchar_column, text_column, integer_column, smallint_column, bigint_column, decimal_column, real_column, double_column, boolean_column, date_column, time_column, timestamp_column, timestamptz_column, jsonb_column, bytea_column) VALUES (1, 'Varchar value 1', 'Text value 1', 123, 456, 789, 12.34, 56.78, 90.12, TRUE, '2023-05-22', '12:34:56', '2023-05-22 12:34:56', 
'2023-05-22T12:34:56Z', '{"key": "value"}', E'\\xDEADBEEF'), diff --git a/integration_tests/mysql-sink/data_check b/integration_tests/mysql-sink/data_check deleted file mode 100644 index 0f8b2d5166847..0000000000000 --- a/integration_tests/mysql-sink/data_check +++ /dev/null @@ -1 +0,0 @@ -user_behaviors,target_count,rw_typed_data \ No newline at end of file diff --git a/integration_tests/mysql-sink/docker-compose.yml b/integration_tests/mysql-sink/docker-compose.yml index 9b946514eb4aa..97d3d78ce4cb0 100644 --- a/integration_tests/mysql-sink/docker-compose.yml +++ b/integration_tests/mysql-sink/docker-compose.yml @@ -21,12 +21,8 @@ services: extends: file: ../../docker/docker-compose.yml service: prometheus-0 - message_queue: - extends: - file: ../../docker/docker-compose.yml - service: message_queue mysql: - image: mysql:8.0 + image: mysql:latest ports: - "3306:3306" environment: @@ -34,33 +30,14 @@ services: - MYSQL_USER=mysqluser - MYSQL_PASSWORD=mysqlpw - MYSQL_DATABASE=mydb + volumes: + - "./mysql_prepare.sql:/mysql_prepare.sql" healthcheck: test: [ "CMD-SHELL", "mysqladmin ping -h 127.0.0.1 -u root -p123456" ] interval: 5s timeout: 5s retries: 5 container_name: mysql - datagen: - build: ../datagen - depends_on: [message_queue] - command: - - /bin/sh - - -c - - /datagen --mode clickstream --qps 2 kafka --brokers message_queue:29092 - restart: always - container_name: datagen - prepare_mysql: - image: mysql:8.0 - depends_on: - - mysql - command: - - /bin/sh - - -c - - "mysql -p123456 -h mysql mydb < mysql_prepare.sql" - volumes: - - "./mysql_prepare.sql:/mysql_prepare.sql" - container_name: prepare_mysql - restart: on-failure volumes: risingwave-standalone: external: false diff --git a/integration_tests/mysql-sink/prepare.sh b/integration_tests/mysql-sink/prepare.sh new file mode 100755 index 0000000000000..9f2e93d1b40a5 --- /dev/null +++ b/integration_tests/mysql-sink/prepare.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -euo pipefail + +sleep 10 + +# setup mysql +docker compose exec mysql bash -c "mysql -p123456 -h mysql mydb < mysql_prepare.sql" diff --git a/integration_tests/mysql-sink/query.sql b/integration_tests/mysql-sink/query.sql deleted file mode 100644 index 6fbe4cc96813e..0000000000000 --- a/integration_tests/mysql-sink/query.sql +++ /dev/null @@ -1,13 +0,0 @@ -SELECT - * -FROM - target_count -LIMIT - 10; - -SELECT - * -FROM - data_types -LIMIT - 10; \ No newline at end of file diff --git a/integration_tests/mysql-sink/sink_check.py b/integration_tests/mysql-sink/sink_check.py new file mode 100644 index 0000000000000..b7cf590c38d09 --- /dev/null +++ b/integration_tests/mysql-sink/sink_check.py @@ -0,0 +1,23 @@ +import subprocess +import sys + +relations = ['target_count', 'data_types'] + +failed_cases = [] +for rel in relations: + sql = f'SELECT COUNT(*) FROM {rel};' + print(f"Running SQL: {sql} ON MYSQL") + command = f'mysql -p123456 mydb -e "{sql}"' + output = subprocess.check_output( + ["docker", "compose", "exec", "mysql", "bash", "-c", command]) + # output: + # COUNT(*) + # 0 + rows = int(output.decode('utf-8').split('\n')[1]) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/postgres-sink/README.md b/integration_tests/postgres-sink/README.md index 3066b8d37e3f9..c0d4d6956aff9 100644 --- a/integration_tests/postgres-sink/README.md +++ b/integration_tests/postgres-sink/README.md @@ -5,9 +5,8 @@ This demo showcases how to 
sink RisingWave's data to an external Postgres. The d Here's what this demo does: 1. `docker compose up -d`: Start the cluster. -2. After 20-30s: `create_source.sql`. -3. After 10s: `create_mv.sql`. -4. After another 10s, the tester will check if the source has ingested some data by creating a materialized view upon the source. It also checks if the MV created in the 3rd step has some data. +2. After 20-30s: `create_source.sql`, `create_mv.sql`, `create_sink.sql`. +3. After another 30s, the tester will check if the ingestion is successful by `SELECT COUNT(*) FROM target_count;` in Postgres. To connect to the Postgres on your local PC: diff --git a/integration_tests/postgres-sink/create_mv.sql b/integration_tests/postgres-sink/create_mv.sql index ef403f4b507c5..2cba41795922f 100644 --- a/integration_tests/postgres-sink/create_mv.sql +++ b/integration_tests/postgres-sink/create_mv.sql @@ -6,45 +6,3 @@ FROM user_behaviors GROUP BY target_id; - -CREATE SINK target_count_postgres_sink -FROM - target_count WITH ( - connector = 'jdbc', - jdbc.url = 'jdbc:postgresql://postgres:5432/mydb?user=myuser&password=123456', - table.name = 'target_count', - type = 'upsert', - primary_key = 'target_id' - ); - --- ingest back to RW -CREATE table rw_typed_data ( - id BIGINT PRIMARY KEY, - varchar_column VARCHAR, - text_column TEXT, - integer_column INTEGER, - smallint_column SMALLINT, - bigint_column BIGINT, - decimal_column DECIMAL, - real_column REAL, - double_column DOUBLE PRECISION, - boolean_column BOOLEAN, - date_column DATE, - time_column TIME, - timestamp_column TIMESTAMP, - timestamptz_column TIMESTAMPTZ, - interval_column INTERVAL, - jsonb_column JSONB, - bytea_column BYTEA, - array_column VARCHAR[] -) WITH ( - connector = 'postgres-cdc', - hostname = 'postgres', - port = '5432', - username = 'myuser', - password = '123456', - database.name = 'mydb', - schema.name = 'public', - table.name = 'data_types', - slot.name = 'data_types' -); diff --git a/integration_tests/postgres-sink/create_sink.sql b/integration_tests/postgres-sink/create_sink.sql new file mode 100644 index 0000000000000..e01ad2760cc2e --- /dev/null +++ b/integration_tests/postgres-sink/create_sink.sql @@ -0,0 +1,20 @@ +CREATE SINK target_count_postgres_sink +FROM + target_count WITH ( + connector = 'jdbc', + jdbc.url = 'jdbc:postgresql://postgres:5432/mydb?user=myuser&password=123456', + table.name = 'target_count', + type = 'upsert', + primary_key = 'target_id' + ); + +-- sink data_type table to pg +CREATE SINK data_types_postgres_sink +FROM + data_types WITH ( + connector = 'jdbc', + jdbc.url = 'jdbc:postgresql://postgres:5432/mydb?user=myuser&password=123456', + table.name = 'data_types', + type='upsert', + primary_key = 'id' +); diff --git a/integration_tests/postgres-sink/create_source.sql b/integration_tests/postgres-sink/create_source.sql index 7303e350f32df..6840f8cb379c1 100644 --- a/integration_tests/postgres-sink/create_source.sql +++ b/integration_tests/postgres-sink/create_source.sql @@ -7,10 +7,11 @@ CREATE SOURCE user_behaviors ( parent_target_type VARCHAR, parent_target_id VARCHAR ) WITH ( - connector = 'kafka', - topic = 'user_behaviors', - properties.bootstrap.server = 'message_queue:29092', - scan.startup.mode = 'earliest' + connector = 'datagen', + fields.user_id.kind = 'sequence', + fields.user_id.start = 1, + fields.user_id.end = 100, + datagen.rows.per.second = '100' ) FORMAT PLAIN ENCODE JSON; CREATE TABLE data_types ( @@ -34,17 +35,6 @@ CREATE TABLE data_types ( array_column VARCHAR[] ); --- sink data_type 
table to pg -CREATE SINK data_types_postgres_sink -FROM - data_types WITH ( - connector = 'jdbc', - jdbc.url = 'jdbc:postgresql://postgres:5432/mydb?user=myuser&password=123456', - table.name = 'data_types', - type='upsert', - primary_key = 'id' -); - INSERT INTO data_types (id, varchar_column, text_column, integer_column, smallint_column, bigint_column, decimal_column, real_column, double_column, boolean_column, date_column, time_column, timestamp_column, timestamptz_column, interval_column, jsonb_column, bytea_column, array_column) VALUES (1, 'Varchar value 1', 'Text value 1', 123, 456, 789, 12.34, 56.78, 90.12, TRUE, '2023-05-22', '12:34:56', '2023-05-22 12:34:56', '2023-05-22 12:34:56+00:00', '1 day', '{"key": "value"}', E'\\xDEADBEEF', ARRAY['Value 1', 'Value 2']), @@ -52,4 +42,3 @@ VALUES (3, 'Varchar value 3', 'Text value 3', 345, 678, 901, 34.56, 78.90, 12.34, TRUE, '2023-05-24', '12:34:56', '2023-05-24 12:34:56', '2023-05-24 12:34:56+00:00', '3 days', '{"key": "value3"}', E'\\xCAFEBABE', ARRAY['Value 5', 'Value 6']), (4, 'Varchar value 4', 'Text value 4', 456, 789, 012, 45.67, 89.01, 23.45, FALSE, '2023-05-25', '23:45:01', '2023-05-25 23:45:01', '2023-05-25 23:45:01+00:00', '4 days', '{"key": "value4"}', E'\\xBABEC0DE', ARRAY['Value 7', 'Value 8']), (5, 'Varchar value 5', 'Text value 5', 567, 890, 123, 56.78, 90.12, 34.56, TRUE, '2023-05-26', '12:34:56', '2023-05-26 12:34:56', '2023-05-26 12:34:56+00:00', '5 days', '{"key": "value5"}', E'\\xDEADBABE', ARRAY['Value 9', 'Value 10']); - diff --git a/integration_tests/postgres-sink/data_check b/integration_tests/postgres-sink/data_check deleted file mode 100644 index 0f8b2d5166847..0000000000000 --- a/integration_tests/postgres-sink/data_check +++ /dev/null @@ -1 +0,0 @@ -user_behaviors,target_count,rw_typed_data \ No newline at end of file diff --git a/integration_tests/postgres-sink/docker-compose.yml b/integration_tests/postgres-sink/docker-compose.yml index a5b8465d7c915..e443965c2e5be 100644 --- a/integration_tests/postgres-sink/docker-compose.yml +++ b/integration_tests/postgres-sink/docker-compose.yml @@ -21,23 +21,10 @@ services: extends: file: ../../docker/docker-compose.yml service: prometheus-0 - message_queue: - extends: - file: ../../docker/docker-compose.yml - service: message_queue - datagen: - build: ../datagen - depends_on: [message_queue] - command: - - /bin/sh - - -c - - /datagen --mode clickstream --qps 2 kafka --brokers message_queue:29092 - restart: always - container_name: datagen # Use this command to connect to the DB from outside the container: # docker exec postgres psql --username=myuser --dbname=mydb postgres: - image: postgres + image: postgres:latest environment: - POSTGRES_USER=myuser - POSTGRES_PASSWORD=123456 @@ -50,20 +37,10 @@ services: timeout: 5s retries: 5 command: [ "postgres", "-c", "wal_level=logical" ] - restart: always - container_name: postgres - prepare_postgres: - image: postgres - depends_on: - - postgres - command: - - /bin/sh - - -c - - "psql postgresql://myuser:123456@postgres:5432/mydb < postgres_prepare.sql" volumes: - "./postgres_prepare.sql:/postgres_prepare.sql" - container_name: prepare_postgres - restart: on-failure + restart: always + container_name: postgres volumes: risingwave-standalone: external: false diff --git a/integration_tests/postgres-sink/prepare.sh b/integration_tests/postgres-sink/prepare.sh new file mode 100755 index 0000000000000..ab1f2ddb465fc --- /dev/null +++ b/integration_tests/postgres-sink/prepare.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -euo pipefail + +# 
setup postgres +docker compose exec postgres bash -c "psql postgresql://myuser:123456@postgres:5432/mydb < postgres_prepare.sql" diff --git a/integration_tests/postgres-sink/query.sql b/integration_tests/postgres-sink/query.sql deleted file mode 100644 index e09c66a255f10..0000000000000 --- a/integration_tests/postgres-sink/query.sql +++ /dev/null @@ -1,6 +0,0 @@ -SELECT - * -FROM - target_count -LIMIT - 10; \ No newline at end of file diff --git a/integration_tests/postgres-sink/sink_check.py b/integration_tests/postgres-sink/sink_check.py new file mode 100644 index 0000000000000..606b78424a262 --- /dev/null +++ b/integration_tests/postgres-sink/sink_check.py @@ -0,0 +1,21 @@ +import sys +import subprocess + + +relations = ['target_count', 'data_types'] + +failed_cases = [] +for rel in relations: + sql = f'SELECT COUNT(*) FROM {rel};' + print(f"Running SQL: {sql} ON PG") + command = f'psql -U $POSTGRES_USER $POSTGRES_DB --tuples-only -c "{sql}"' + rows = subprocess.check_output( + ["docker", "compose", "exec", "postgres", "bash", "-c", command]) + rows = int(rows.decode('utf-8').strip()) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/redis-sink/sink_check b/integration_tests/redis-sink/sink_check deleted file mode 100644 index 80a05155b821f..0000000000000 --- a/integration_tests/redis-sink/sink_check +++ /dev/null @@ -1 +0,0 @@ -user_id,UserID,types_id,TYPESID diff --git a/integration_tests/redis-sink/sink_check.py b/integration_tests/redis-sink/sink_check.py new file mode 100644 index 0000000000000..24debe867a9bf --- /dev/null +++ b/integration_tests/redis-sink/sink_check.py @@ -0,0 +1,22 @@ +import sys +import subprocess + +relations = ['user_id', 'UserID', 'types_id', 'TYPESID'] + +failed_cases = [] +for rel in relations: + query = f"*{rel}*" + print(f"Running query: scan {query} on Redis") + output = subprocess.Popen(["docker", "compose", "exec", "redis", "redis-cli", "--scan", "--pattern", query], + stdout=subprocess.PIPE) + rows = subprocess.check_output(["wc", "-l"], stdin=output.stdout) + output.stdout.close() + output.wait() + rows = int(rows.decode('utf8').strip()) + print(f"{rows} keys in '*{rel}*'") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/scripts/check_data.py b/integration_tests/scripts/check_data.py index 0575615df8ef3..9c449d5bff43b 100644 --- a/integration_tests/scripts/check_data.py +++ b/integration_tests/scripts/check_data.py @@ -1,6 +1,7 @@ #!/usr/bin/python3 -# Every demo directory contains a 'data_check' file that lists the relations (either source or mv) +# Every sink demo directory contains a 'sink_check.py' file that used to check test, +# and other demo directory contains a 'data_check' file that lists the relations (either source or mv) # that are expected to have >=1 rows. This script runs the checks by creating a materialized view over it, # and verify the rows count in the view. 
@@ -52,35 +53,67 @@ def run_psql(sql):
                                     "-d", "dev", "-U", "root", "--tuples-only", "-c", sql])


+def data_check(data_check_file: str):
+    with open(data_check_file) as f:
+        relations = f.read().strip().split(",")
+        for rel in relations:
+            create_mv(rel)
+        time.sleep(20)
+        failed_cases = []
+        for rel in relations:
+            if not check_mv(rel):
+                failed_cases.append(rel)
+        if len(failed_cases) != 0:
+            raise Exception("Data check failed for case {}".format(failed_cases))
+
+
+def sink_check(demo_dir: str, sink_check_file: str):
+    print("sink created. Wait for half min time for ingestion")
+
+    # wait for half min ingestion
+    time.sleep(30)
+    subprocess.run(["python3", sink_check_file], cwd=demo_dir, check=True)
+
+
+def cdc_check(cdc_check_file: str, upstream: str):
+    with open(cdc_check_file) as f:
+        print("Check cdc table with upstream {}".format(upstream))
+        relations = f.read().strip().split(",")
+        for rel in relations:
+            check_cdc_table(rel)
+
+
+def test_check(demo: str, upstream: str, need_data_check=True, need_sink_check=False):
+    file_dir = dirname(abspath(__file__))
+    project_dir = dirname(file_dir)
+    demo_dir = os.path.join(project_dir, demo)
+
+    data_check_file = os.path.join(demo_dir, 'data_check')
+    if need_data_check or os.path.exists(data_check_file):
+        data_check(data_check_file)
+    else:
+        print(f"skip data check for {demo}")
+
+    sink_check_file = os.path.join(demo_dir, 'sink_check.py')
+    if need_sink_check or os.path.exists(sink_check_file):
+        sink_check(demo_dir, sink_check_file)
+    else:
+        print(f"skip sink check for {demo}")
+
+    cdc_check_file = os.path.join(demo_dir, 'cdc_check')
+    if os.path.exists(cdc_check_file):
+        cdc_check(cdc_check_file, upstream)
+    else:
+        print(f"skip cdc check for {demo}")
+
+
 demo = sys.argv[1]
 upstream = sys.argv[2]  # mysql, postgres, etc. see scripts/integration_tests.sh
-if demo in ['docker', 'iceberg-sink','clickhouse-sink', 'iceberg-cdc', 'kafka-cdc-sink', 'cassandra-and-scylladb-sink', 'elasticsearch-sink', 'redis-sink', 'big-query-sink']:
+if demo in ['docker', 'iceberg-cdc']:
     print('Skip for running test for `%s`' % demo)
     sys.exit(0)

-file_dir = dirname(abspath(__file__))
-project_dir = dirname(file_dir)
-demo_dir = os.path.join(project_dir, demo)
-data_check_file = os.path.join(demo_dir, 'data_check')
-with open(data_check_file) as f:
-    relations = f.read().strip().split(",")
-    for rel in relations:
-        create_mv(rel)
-    time.sleep(20)
-    failed_cases = []
-    for rel in relations:
-        if not check_mv(rel):
-            failed_cases.append(rel)
-    if len(failed_cases) != 0:
-        raise Exception("Data check failed for case {}".format(failed_cases))
-
-cdc_check_file = os.path.join(demo_dir, 'cdc_check')
-if not os.path.exists(cdc_check_file):
-    print("Skip cdc check for {}".format(demo))
-    sys.exit(0)
-
-with open(cdc_check_file) as f:
-    print("Check cdc table with upstream {}".format(upstream))
-    relations = f.read().strip().split(",")
-    for rel in relations:
-        check_cdc_table(rel)
+if 'sink' in demo:
+    test_check(demo, upstream, need_data_check=False, need_sink_check=True)
+else:
+    test_check(demo, upstream, need_data_check=True, need_sink_check=False)
diff --git a/integration_tests/scripts/run_demos.py b/integration_tests/scripts/run_demos.py
index c43c4a4cc1556..87967daa50b2c 100644
--- a/integration_tests/scripts/run_demos.py
+++ b/integration_tests/scripts/run_demos.py
@@ -6,7 +6,6 @@
 import subprocess
 from time import sleep
 import argparse
-import json


 def run_sql_file(f: str, dir: str):
@@ -20,16 +19,7 @@ def run_sql_file(f: str, dir: str):
         sys.exit(1)


-def run_bash_file(f: str, dir: str):
-    print("Running Bash file: {}".format(f))
-    # ON_ERROR_STOP=1 will let psql return error code when the query fails.
-    # https://stackoverflow.com/questions/37072245/check-return-status-of-psql-command-in-unix-shell-scripting
-    proc = subprocess.run(["bash", f], check=True, cwd=dir)
-    if proc.returncode != 0:
-        sys.exit(1)
-
-
-def run_demo(demo: str, format: str, wait_time = 40):
+def run_demo(demo: str, format: str, wait_time=40):
     file_dir = dirname(abspath(__file__))
     project_dir = dirname(file_dir)
     demo_dir = os.path.join(project_dir, demo)
@@ -38,7 +28,11 @@ def run_demo(demo: str, format: str, wait_time = 40):
     subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True)
     sleep(wait_time)

-    sql_files = ['create_source.sql', 'create_mv.sql', 'query.sql']
+    prepare_file = 'prepare.sh'
+    if os.path.exists(os.path.join(demo_dir, prepare_file)):
+        subprocess.run(["bash", prepare_file], cwd=demo_dir, check=True)
+
+    sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql', 'query.sql']
     for fname in sql_files:
         if format == 'protobuf':
             sql_file = os.path.join(demo_dir, "pb", fname)
@@ -53,50 +47,7 @@ def run_demo(demo: str, format: str, wait_time = 40):
             continue
         run_sql_file(sql_file, demo_dir)
         sleep(10)
-    # Run query_sink.sh if it exists.
- query_sink_file = os.path.join(demo_dir, 'query_sink.sh') - if os.path.isfile(query_sink_file): - run_bash_file(query_sink_file, demo_dir) - - -def run_kafka_cdc_demo(): - demo = "kafka-cdc-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: kafka-cdc-sink") - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - sleep(40) - - subprocess.run(["bash","./prepare.sh"], cwd=demo_dir, check=True) - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - - print("sink created. Wait for 2 min time for ingestion") - - # wait for two minutes ingestion - sleep(120) - - pg_check_file = os.path.join(demo_dir, 'pg_check') - with open(pg_check_file) as f: - relations = f.read().strip().split(",") - failed_cases = [] - for rel in relations: - sql = 'SELECT COUNT(*) FROM {}'.format(rel) - print("Running SQL: {} on PG".format(sql)) - command = 'psql -U $POSTGRES_USER $POSTGRES_DB --tuples-only -c "{}"'.format(sql) - rows = subprocess.check_output(["docker", "exec", "postgres", "bash", "-c", command]) - rows = int(rows.decode('utf8').strip()) - print("{} rows in {}".format(rows, rel)) - if rows < 1: - failed_cases.append(rel) - if len(failed_cases) != 0: - raise Exception("Data check failed for case {}".format(failed_cases)) def iceberg_cdc_demo(): demo = "iceberg-cdc" @@ -106,263 +57,6 @@ def iceberg_cdc_demo(): print("Running demo: iceberg-cdc") subprocess.run(["bash","./run_test.sh"], cwd=demo_dir, check=True) -def run_iceberg_demo(): - demo = "iceberg-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: iceberg-sink") - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - sleep(40) - - subprocess.run(["docker", "compose", "exec", "spark", "bash", "/spark-script/run-sql-file.sh", "create-table"], - cwd=demo_dir, check=True) - - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - sleep(10) - - print("sink created. 
Wait for 2 min time for ingestion") - - # wait for two minutes ingestion - sleep(120) - - query_sql = open(os.path.join(demo_dir, "iceberg-query.sql")).read() - - print("querying iceberg with presto sql: %s" % query_sql) - - query_output_file_name = "query_outout.txt" - - query_output_file = open(query_output_file_name, "wb") - - subprocess.run(["docker", "compose", "exec", "presto", "presto-cli", "--server", "localhost:8080", "--execute", query_sql], - cwd=demo_dir, check=True, stdout=query_output_file) - query_output_file.close() - - output_content = open(query_output_file_name).read() - - print(output_content) - - assert len(output_content.strip()) > 0 - -def run_clickhouse_demo(): - demo = "clickhouse-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: clickhouse-sink") - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - sleep(40) - - - subprocess.run(["docker", "compose", "exec", "clickhouse-server", "bash", "/opt/clickhouse/clickhouse-sql/run-sql-file.sh", "create_clickhouse_table"], - cwd=demo_dir, check=True) - - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - sleep(10) - - print("sink created. Wait for 2 min time for ingestion") - - # wait for two minutes ingestion - sleep(120) - - query_output_file_name = "query_outout.txt" - - query_output_file = open(query_output_file_name, "wb") - - subprocess.run(["docker", "compose", "exec", "clickhouse-server", "bash", "/opt/clickhouse/clickhouse-sql/run-sql-file.sh", "clickhouse_query"], - cwd=demo_dir, check=True, stdout=query_output_file) - query_output_file.close() - - output_content = open(query_output_file_name).read() - - print(output_content) - - assert len(output_content.strip()) > 0 - -def run_cassandra_and_scylladb_sink_demo(): - demo = "cassandra-and-scylladb-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: {}".format(demo)) - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - print("wait two min for cassandra and scylladb to start up") - sleep(120) - - dbs = ['cassandra', 'scylladb'] - for db in dbs: - subprocess.run(["docker", "compose", "exec", db, "cqlsh", "-f", "prepare_cassandra_and_scylladb.sql"], cwd=demo_dir, check=True) - - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - - print("sink created. 
Wait for 1 min time for ingestion") - - # wait for one minutes ingestion - sleep(60) - - sink_check_file = os.path.join(demo_dir, 'sink_check') - with open(sink_check_file) as f: - relations = f.read().strip().split(",") - failed_cases = [] - for rel in relations: - sql = 'select count(*) from {};'.format(rel) - for db in dbs: - print("Running SQL: {} on {}".format(sql, db)) - query_output_file_name = os.path.join(demo_dir, "query_{}_outout.txt".format(db)) - query_output_file = open(query_output_file_name, "wb+") - - command = "docker compose exec scylladb cqlsh -e" - subprocess.run(["docker", "compose", "exec", db, "cqlsh", "-e", sql], cwd=demo_dir, check=True, stdout=query_output_file) - - # output file: - # - # count - # ------- - # 1000 - # - # (1 rows) - query_output_file.seek(0) - lines = query_output_file.readlines() - query_output_file.close() - assert len(lines) >= 6 - assert lines[1].decode('utf-8').strip().lower() == 'count' - rows = int(lines[3].decode('utf-8').strip()) - print("{} rows in {}.{}".format(rows, db, rel)) - if rows < 1: - failed_cases.append(db + "_" + rel) - if len(failed_cases) != 0: - raise Exception("Data check failed for case {}".format(failed_cases)) - -def run_elasticsearch_sink_demo(): - demo = "elasticsearch-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: {}".format(demo)) - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - sleep(60) - - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - - print("sink created. 
Wait for half min time for ingestion") - - # wait for half min ingestion - sleep(30) - - versions = ['7', '8'] - sink_check_file = os.path.join(demo_dir, 'sink_check') - with open(sink_check_file) as f: - relations = f.read().strip().split(",") - failed_cases = [] - for rel in relations: - query = 'curl -XGET -u elastic:risingwave "http://localhost:9200/{}/_count" -H "Content-Type: application/json"'.format(rel) - for v in versions: - es = 'elasticsearch{}'.format(v) - print("Running Query: {} on {}".format(query, es)) - counts = subprocess.check_output(["docker", "compose", "exec", es, "bash", "-c", query], cwd=demo_dir) - counts = json.loads(counts)['count'] - print("{} counts in {}_{}".format(counts, es, rel)) - if counts < 1: - failed_cases.append(es + '_' + rel) - if len(failed_cases) != 0: - raise Exception("Data check failed for case {}".format(failed_cases)) - -def run_redis_demo(): - demo = "redis-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: {}".format(demo)) - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - sleep(40) - - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - - sleep(40) - sink_check_file = os.path.join(demo_dir, 'sink_check') - with open(sink_check_file) as f: - relations = f.read().strip().split(",") - failed_cases = [] - for rel in relations: - query = "*{}*".format(rel) - print("Running query: scan on Redis".format(query)) - output = subprocess.Popen(["docker", "compose", "exec", "redis", "redis-cli", "--scan", "--pattern", query], cwd=demo_dir, stdout=subprocess.PIPE) - rows = subprocess.check_output(["wc", "-l"], cwd=demo_dir, stdin=output.stdout) - output.stdout.close() - output.wait() - rows = int(rows.decode('utf8').strip()) - print("{} keys in '*{}*'".format(rows, rel)) - if rows < 1: - failed_cases.append(rel) - if len(failed_cases) != 0: - raise Exception("Data check failed for case {}".format(failed_cases)) - -def run_bigquery_demo(): - demo = "big-query-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: {}".format(demo)) - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - subprocess.run(["docker", "compose", "exec", "gcloud-cli", "gcloud", "auth", "login", "--cred-file=/gcp-rwctest.json"], cwd=demo_dir, check=True) - subprocess.run(["docker", "compose", "exec", "gcloud-cli", "gcloud", "config", "set", "project", "rwctest"], cwd=demo_dir, check=True) - - bq_prepare_file = os.path.join(demo_dir, 'bq_prepare.sql') - bq_prepare_content = open(bq_prepare_file).read().strip() - subprocess.run(["docker", "compose", "exec", "gcloud-cli", "bq", "query", "--use_legacy_sql=false", bq_prepare_content], cwd=demo_dir, check=True) - sleep(30) - - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, "append-only-sql/"+fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - - sleep(30) - sink_check_file = os.path.join(demo_dir, 'sink_check') - with open(sink_check_file) as f: - relations = f.read().strip().split(",") - failed_cases = [] - for rel in relations: - sql = "SELECT COUNT(*) AS count FROM 
`{}`".format(rel) - print("run sql {} on Bigquery".format(sql)) - rows = subprocess.check_output(["docker", "compose", "exec", "gcloud-cli", "bq", "query", "--use_legacy_sql=false", "--format=json", sql], cwd=demo_dir) - rows = int(json.loads(rows.decode("utf-8").strip())[0]['count']) - print("{} rows in {}".format(rows, rel)) - if rows < 1: - failed_cases.append(rel) - - drop_sql = "DROP TABLE IF EXISTS `{}`".format(rel) - subprocess.run(["docker", "compose", "exec", "gcloud-cli", "bq", "query", "--use_legacy_sql=false", drop_sql], cwd=demo_dir, check=True) - - if len(failed_cases) != 0: - raise Exception("Data check failed for case {}".format(failed_cases)) arg_parser = argparse.ArgumentParser(description="Run the demo") arg_parser.add_argument( @@ -379,24 +73,7 @@ def run_bigquery_demo(): # disable telemetry in env os.environ['ENABLE_TELEMETRY'] = "false" -if args.case == "iceberg-sink": - if args.format == "protobuf": - print("skip protobuf test for iceberg-sink") - else: - run_iceberg_demo() -elif args.case == "clickhouse-sink": - run_clickhouse_demo() -elif args.case == "iceberg-cdc": +if args.case == "iceberg-cdc": iceberg_cdc_demo() -elif args.case == "kafka-cdc-sink": - run_kafka_cdc_demo() -elif args.case == "cassandra-and-scylladb-sink": - run_cassandra_and_scylladb_sink_demo() -elif args.case == "elasticsearch-sink": - run_elasticsearch_sink_demo() -elif args.case == "redis-sink": - run_redis_demo() -elif args.case == "big-query-sink": - run_bigquery_demo() else: run_demo(args.case, args.format) diff --git a/integration_tests/tidb-cdc-sink/create_mv.sql b/integration_tests/tidb-cdc-sink/create_mv.sql index 242c42846bd5a..95aef4c4883f3 100644 --- a/integration_tests/tidb-cdc-sink/create_mv.sql +++ b/integration_tests/tidb-cdc-sink/create_mv.sql @@ -30,12 +30,3 @@ FROM datatype GROUP BY c0_boolean; - -CREATE SINK hot_hashtags_sink FROM hot_hashtags -WITH ( - connector='jdbc', - jdbc.url='jdbc:mysql://tidb:4000/test?user=root&password=', - table.name='hot_hashtags', - type='upsert', - primary_key='window_start,hashtag' -); diff --git a/integration_tests/tidb-cdc-sink/create_sink.sql b/integration_tests/tidb-cdc-sink/create_sink.sql new file mode 100644 index 0000000000000..7c7726ad8120f --- /dev/null +++ b/integration_tests/tidb-cdc-sink/create_sink.sql @@ -0,0 +1,8 @@ +CREATE SINK hot_hashtags_sink FROM hot_hashtags +WITH ( + connector='jdbc', + jdbc.url='jdbc:mysql://tidb:4000/test?user=root&password=', + table.name='hot_hashtags', + type='upsert', + primary_key='window_start,hashtag' +); diff --git a/integration_tests/tidb-cdc-sink/docker-compose.yml b/integration_tests/tidb-cdc-sink/docker-compose.yml index 70481ab6dbb5c..5d756ba15ffff 100644 --- a/integration_tests/tidb-cdc-sink/docker-compose.yml +++ b/integration_tests/tidb-cdc-sink/docker-compose.yml @@ -190,20 +190,13 @@ services: restart: always container_name: datagen - init_tidb: - image: mysql:8.0 - depends_on: - - tidb - command: - - /bin/sh - - -c - - "mysql --password= -h tidb --port 4000 -u root test < tidb_create_tables.sql && - sleep 10 && - mysql --password= -h tidb --port 4000 -u root test < tidb_prepare.sql" + mysql: + image: mysql:latest + command: tail -f /dev/null volumes: - "./tidb_create_tables.sql:/tidb_create_tables.sql" - "./tidb_prepare.sql:/tidb_prepare.sql" - container_name: init_tidb + container_name: mysql restart: on-failure volumes: @@ -219,3 +212,4 @@ volumes: external: false message_queue: external: false +name: risingwave-compose diff --git a/integration_tests/tidb-cdc-sink/prepare.sh 
b/integration_tests/tidb-cdc-sink/prepare.sh new file mode 100755 index 0000000000000..e60363e232e06 --- /dev/null +++ b/integration_tests/tidb-cdc-sink/prepare.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -euo pipefail + +docker compose exec mysql bash -c "mysql --password= -h tidb --port 4000 -u root test < tidb_create_tables.sql" + +docker compose exec mysql bash -c "mysql --password= -h tidb --port 4000 -u root test < tidb_prepare.sql" + +sleep 15 diff --git a/integration_tests/tidb-cdc-sink/sink_check.py b/integration_tests/tidb-cdc-sink/sink_check.py new file mode 100644 index 0000000000000..19185d950cc01 --- /dev/null +++ b/integration_tests/tidb-cdc-sink/sink_check.py @@ -0,0 +1,22 @@ +import sys +import subprocess + +relations = ['hot_hashtags'] + +failed_cases = [] +for rel in relations: + sql = f'SELECT COUNT(*) FROM {rel};' + command = f'mysql --password= -h tidb --port 4000 -u root test -e "{sql}"' + output = subprocess.check_output( + ["docker", "compose", "exec", "mysql", "bash", "-c", command]) + # output: + # COUNT(*) + # 0 + rows = int(output.decode('utf-8').split('\n')[1]) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/proto/hummock.proto b/proto/hummock.proto index 3d3a831c1c06a..df582cc491ae3 100644 --- a/proto/hummock.proto +++ b/proto/hummock.proto @@ -116,6 +116,11 @@ message VnodeWatermark { common.Buffer vnode_bitmap = 2; } +// Table watermark is a lighter weight range delete introduced in +// https://github.com/risingwavelabs/risingwave/issues/13148 +// It means the lowest (or highest when `is_ascending` is false) visible +// keys in the table within a vnode. Keys lower (or higher) than the +// table watermark is invisible and will be cleaned in later compaction. message TableWatermarks { message EpochNewWatermarks { repeated VnodeWatermark watermarks = 1; @@ -145,7 +150,7 @@ message HummockVersion { // Snapshots with epoch less than the safe epoch have been GCed. // Reads against such an epoch will fail. uint64 safe_epoch = 4; - map table_watermarks = 5; + map table_watermarks = 5; } message HummockVersionDelta { @@ -162,7 +167,7 @@ message HummockVersionDelta { uint64 safe_epoch = 5; bool trivial_move = 6; repeated uint64 gc_object_ids = 7; - map new_table_watermarks = 8; + map new_table_watermarks = 8; } message HummockVersionDeltas { @@ -337,6 +342,9 @@ message CompactTask { // Deprecated. use table_vnode_partition instead; uint32 split_weight_by_vnode = 22 [deprecated = true]; map table_vnode_partition = 23; + // The table watermark of any table id. In compaction we only use the table watermarks on safe epoch, + // so we only need to include the table watermarks on safe epoch to reduce the size of metadata. + map table_watermarks = 24; } message LevelHandler { diff --git a/proto/stream_plan.proto b/proto/stream_plan.proto index c97a0f2d8406e..65b3a10db8b68 100644 --- a/proto/stream_plan.proto +++ b/proto/stream_plan.proto @@ -495,6 +495,9 @@ enum StreamScanType { // ChainExecutor with upstream_only = true STREAM_SCAN_TYPE_UPSTREAM_ONLY = 4; + + // ArrangementBackfillExecutor + STREAM_SCAN_TYPE_ARRANGEMENT_BACKFILL = 5; } // StreamScanNode reads data from upstream table first, and then pass all events to downstream. @@ -524,6 +527,7 @@ message StreamScanNode { catalog.Table state_table = 5; // The upstream materialized view info used by backfill. + // Used iff `ChainType::Backfill`. 
plan_common.StorageTableDesc table_desc = 7; // The rate limit for the stream scan node. @@ -531,6 +535,10 @@ message StreamScanNode { // Snapshot read every N barriers uint32 snapshot_read_barrier_interval = 9 [deprecated = true]; + + // The state table used by ArrangementBackfill to replicate upstream mview's state table. + // Used iff `ChainType::ArrangementBackfill`. + catalog.Table arrangement_table = 10; } message StreamCdcScanNode { diff --git a/proto/stream_service.proto b/proto/stream_service.proto index 5d82452dc0ca9..462f5ff0256a6 100644 --- a/proto/stream_service.proto +++ b/proto/stream_service.proto @@ -87,7 +87,7 @@ message BarrierCompleteResponse { } repeated GroupedSstableInfo synced_sstables = 4; uint32 worker_id = 5; - map table_watermarks = 6; + map table_watermarks = 6; } // Before starting streaming, the leader node broadcast the actor-host table to needed workers. diff --git a/src/common/src/array/data_chunk.rs b/src/common/src/array/data_chunk.rs index fff5efc22d1f8..90c2560cadcb2 100644 --- a/src/common/src/array/data_chunk.rs +++ b/src/common/src/array/data_chunk.rs @@ -34,8 +34,9 @@ use crate::field_generator::{FieldGeneratorImpl, VarcharProperty}; use crate::hash::HashCode; use crate::row::Row; use crate::types::{DataType, DatumRef, StructType, ToOwnedDatum, ToText}; +use crate::util::chunk_coalesce::DataChunkBuilder; use crate::util::hash_util::finalize_hashers; -use crate::util::iter_util::{ZipEqDebug, ZipEqFast}; +use crate::util::iter_util::ZipEqFast; use crate::util::value_encoding::{ estimate_serialize_datum_size, serialize_datum_into, try_get_exact_serialize_datum_size, ValueRowSerializer, @@ -95,23 +96,24 @@ impl DataChunk { } /// Build a `DataChunk` with rows. + /// + /// Panics if the `rows` is empty. + /// + /// Should prefer using [`DataChunkBuilder`] instead to avoid unnecessary allocation + /// of rows. pub fn from_rows(rows: &[impl Row], data_types: &[DataType]) -> Self { - let mut array_builders = data_types - .iter() - .map(|data_type| data_type.create_array_builder(1)) - .collect::>(); + // `append_one_row` will cause the builder to finish immediately once capacity is met. + // Hence, we allocate an extra row here, to avoid the builder finishing prematurely. + // This just makes the code cleaner, since we can loop through all rows, and consume it finally. + // TODO: introduce `new_unlimited` to decouple memory reservation from builder capacity. + let mut builder = DataChunkBuilder::new(data_types.to_vec(), rows.len() + 1); for row in rows { - for (datum, builder) in row.iter().zip_eq_debug(array_builders.iter_mut()) { - builder.append(datum); - } + let none = builder.append_one_row(row); + debug_assert!(none.is_none()); } - let new_columns = array_builders - .into_iter() - .map(|builder| builder.finish().into()) - .collect::>(); - DataChunk::new(new_columns, rows.len()) + builder.consume_all().expect("chunk should not be empty") } /// Return the next visible row index on or after `row_idx`. @@ -322,83 +324,24 @@ impl DataChunk { /// `rechunk` creates a new vector of data chunk whose size is `each_size_limit`. /// When the total cardinality of all the chunks is not evenly divided by the `each_size_limit`, /// the last new chunk will be the remainder. - /// - /// Currently, `rechunk` would ignore visibility map. 
May or may not support it later depending - /// on the demand pub fn rechunk(chunks: &[DataChunk], each_size_limit: usize) -> ArrayResult> { - assert!(each_size_limit > 0); - // Corner case: one of the `chunks` may have 0 length - // remove the chunks with zero physical length here, - // or skip them in the loop below - let chunks = chunks - .iter() - .filter(|chunk| chunk.capacity() != 0) - .collect::>(); - if chunks.is_empty() { + let Some(data_types) = chunks.first().map(|c| c.data_types()) else { return Ok(Vec::new()); - } + }; - let mut total_capacity = chunks.iter().map(|chunk| chunk.capacity()).sum(); - let num_chunks = (total_capacity + each_size_limit - 1) / each_size_limit; + let mut builder = DataChunkBuilder::new(data_types, each_size_limit); + let mut outputs = Vec::new(); - // the idx of `chunks` - let mut chunk_idx = 0; - // the row idx of `chunks[chunk_idx]` - let mut start_row_idx = 0; - // how many rows does this new chunk need? - let mut new_chunk_require = std::cmp::min(total_capacity, each_size_limit); - let mut array_builders: Vec = chunks[0] - .columns - .iter() - .map(|col| col.create_builder(new_chunk_require)) - .collect(); - let mut array_len = new_chunk_require; - let mut new_chunks = Vec::with_capacity(num_chunks); - while chunk_idx < chunks.len() { - let capacity = chunks[chunk_idx].capacity(); - let num_rows_left = capacity - start_row_idx; - let actual_acquire = std::cmp::min(new_chunk_require, num_rows_left); - let end_row_idx = start_row_idx + actual_acquire - 1; - array_builders - .iter_mut() - .zip_eq_fast(chunks[chunk_idx].columns()) - .for_each(|(builder, column)| { - let mut array_builder = column.create_builder(end_row_idx - start_row_idx + 1); - for row_idx in start_row_idx..=end_row_idx { - array_builder.append(column.value_at(row_idx)); - } - builder.append_array(&array_builder.finish()); - }); - // since `end_row_idx` is inclusive, exclude it for the next round. - start_row_idx = end_row_idx + 1; - // if the current `chunks[chunk_idx] is used up, move to the next one - if start_row_idx == capacity { - chunk_idx += 1; - start_row_idx = 0; - } - new_chunk_require -= actual_acquire; - total_capacity -= actual_acquire; - // a new chunk receives enough rows, finalize it - if new_chunk_require == 0 { - let new_columns: Vec = array_builders - .drain(..) - .map(|builder| builder.finish().into()) - .collect(); - - array_builders = new_columns - .iter() - .map(|col_type| col_type.create_builder(new_chunk_require)) - .collect(); - - let data_chunk = DataChunk::new(new_columns, array_len); - new_chunks.push(data_chunk); - - new_chunk_require = std::cmp::min(total_capacity, each_size_limit); - array_len = new_chunk_require; + for chunk in chunks { + for output in builder.append_chunk(chunk.clone()) { + outputs.push(output); } } + if let Some(output) = builder.consume_all() { + outputs.push(output); + } - Ok(new_chunks) + Ok(outputs) } /// Compute hash values for each row. 
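The rewritten `from_rows` and `rechunk` above both delegate chunk assembly to `DataChunkBuilder`. A minimal caller-side sketch of that pattern, assuming the `DataChunkBuilder` API shown in `src/common/src/util/chunk_coalesce.rs` (`new`, `append_one_row`, `consume_all`) and two hypothetical helpers, `produce_rows` and `handle_chunk`:

use risingwave_common::array::DataChunk;
use risingwave_common::row::OwnedRow;
use risingwave_common::types::DataType;
use risingwave_common::util::chunk_coalesce::DataChunkBuilder;

// Hypothetical helpers, for illustration only.
fn produce_rows() -> Vec<OwnedRow> {
    Vec::new()
}
fn handle_chunk(_chunk: DataChunk) {}

fn build_chunks(data_types: Vec<DataType>) {
    // Cap each output chunk at 1024 rows; the builder hands back a full chunk as
    // soon as the capacity is reached, so rows never pile up in an intermediate Vec.
    let mut builder = DataChunkBuilder::new(data_types, 1024);
    for row in produce_rows() {
        if let Some(chunk) = builder.append_one_row(row) {
            handle_chunk(chunk);
        }
    }
    // Flush the final, possibly smaller, chunk.
    if let Some(chunk) = builder.consume_all() {
        handle_chunk(chunk);
    }
}

Emitting chunks as they fill is also why the new doc comments steer callers toward the builder instead of `from_rows`: it avoids materializing every row before the first chunk can be produced.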
diff --git a/src/common/src/array/mod.rs b/src/common/src/array/mod.rs index 086f7ffd5cc9d..80d84e5245d2d 100644 --- a/src/common/src/array/mod.rs +++ b/src/common/src/array/mod.rs @@ -32,6 +32,7 @@ mod num256_array; mod primitive_array; mod proto_reader; pub mod stream_chunk; +pub mod stream_chunk_builder; mod stream_chunk_iter; pub mod stream_record; pub mod struct_array; diff --git a/src/common/src/array/stream_chunk.rs b/src/common/src/array/stream_chunk.rs index e024d22ec5172..192d4adfe7d3e 100644 --- a/src/common/src/array/stream_chunk.rs +++ b/src/common/src/array/stream_chunk.rs @@ -24,6 +24,7 @@ use rand::prelude::SmallRng; use rand::{Rng, SeedableRng}; use risingwave_pb::data::{PbOp, PbStreamChunk}; +use super::stream_chunk_builder::StreamChunkBuilder; use super::{ArrayImpl, ArrayRef, ArrayResult, DataChunkTestExt, RowRef}; use crate::array::DataChunk; use crate::buffer::{Bitmap, BitmapBuilder}; @@ -32,7 +33,7 @@ use crate::estimate_size::EstimateSize; use crate::field_generator::VarcharProperty; use crate::row::Row; use crate::types::{DataType, DefaultOrdered, ToText}; -use crate::util::iter_util::ZipEqDebug; + /// `Op` represents three operations in `StreamChunk`. /// /// `UpdateDelete` and `UpdateInsert` are semantically equivalent to `Delete` and `Insert` @@ -125,26 +126,24 @@ impl StreamChunk { } /// Build a `StreamChunk` from rows. - // TODO: introducing something like `StreamChunkBuilder` maybe better. + /// + /// Panics if the `rows` is empty. + /// + /// Should prefer using [`StreamChunkBuilder`] instead to avoid unnecessary + /// allocation of rows. pub fn from_rows(rows: &[(Op, impl Row)], data_types: &[DataType]) -> Self { - let mut array_builders = data_types - .iter() - .map(|data_type| data_type.create_array_builder(rows.len())) - .collect::>(); - let mut ops = vec![]; + // `append_row` will cause the builder to finish immediately once capacity is met. + // Hence, we allocate an extra row here, to avoid the builder finishing prematurely. + // This just makes the code cleaner, since we can loop through all rows, and consume it finally. + // TODO: introduce `new_unlimited` to decouple memory reservation from builder capacity. + let mut builder = StreamChunkBuilder::new(rows.len() + 1, data_types.to_vec()); for (op, row) in rows { - ops.push(*op); - for (datum, builder) in row.iter().zip_eq_debug(array_builders.iter_mut()) { - builder.append(datum); - } + let none = builder.append_row(*op, row); + debug_assert!(none.is_none()); } - let new_columns = array_builders - .into_iter() - .map(|builder| builder.finish().into()) - .collect::>(); - StreamChunk::new(ops, new_columns) + builder.take().expect("chunk should not be empty") } /// Get the reference of the underlying data chunk. @@ -182,33 +181,20 @@ impl StreamChunk { /// For consecutive `UpdateDelete` and `UpdateInsert`, they will be kept in one chunk. /// As a result, some chunks may have `size + 1` rows. pub fn split(&self, size: usize) -> Vec { - let data_types = self.data_types(); - let mut rows = Vec::with_capacity(size + 1); - let mut results = vec![]; - - let mut iter = self.rows(); - while let Some(row) = iter.next() { - rows.push(row); - if rows.len() == size { - // If the last row is UpdateDelete, also include the UpdateInsert. 
- if rows.last().unwrap().0 == Op::UpdateDelete { - let next_row = iter - .next() - .expect("UpdateDelete should have UpdateInsert after"); - assert_eq!(next_row.0, Op::UpdateInsert); - rows.push(next_row); - } - let chunk = Self::from_rows(&rows, &data_types); - results.push(chunk); - rows.clear(); + let mut builder = StreamChunkBuilder::new(size, self.data_types()); + let mut outputs = Vec::new(); + + // TODO: directly append the chunk. + for (op, row) in self.rows() { + if let Some(chunk) = builder.append_row(op, row) { + outputs.push(chunk); } } - - if !rows.is_empty() { - let chunk = StreamChunk::from_rows(&rows, &data_types); - results.push(chunk); + if let Some(output) = builder.take() { + outputs.push(output); } - results + + outputs } pub fn into_parts(self) -> (DataChunk, Arc<[Op]>) { @@ -534,38 +520,10 @@ impl StreamChunkMut { } } } -/// Test utilities for [`StreamChunk`]. -pub trait StreamChunkTestExt: Sized { - fn from_pretty(s: &str) -> Self; - - /// Validate the `StreamChunk` layout. - fn valid(&self) -> bool; - - /// Concatenate multiple `StreamChunk` into one. - fn concat(chunks: Vec) -> Self; - - /// Sort rows. - fn sort_rows(self) -> Self; - - /// Generate stream chunks - fn gen_stream_chunks( - num_of_chunks: usize, - chunk_size: usize, - data_types: &[DataType], - varchar_properties: &VarcharProperty, - ) -> Vec; - - fn gen_stream_chunks_inner( - num_of_chunks: usize, - chunk_size: usize, - data_types: &[DataType], - varchar_properties: &VarcharProperty, - visibility_percent: f64, // % of rows that are visible - inserts_percent: f64, - ) -> Vec; -} -impl StreamChunkTestExt for StreamChunk { +/// Test utilities for [`StreamChunk`]. +#[easy_ext::ext(StreamChunkTestExt)] +impl StreamChunk { /// Parse a chunk from string. /// /// See also [`DataChunkTestExt::from_pretty`]. @@ -606,7 +564,7 @@ impl StreamChunkTestExt for StreamChunk { /// // x[]: array of x /// // : struct /// ``` - fn from_pretty(s: &str) -> Self { + pub fn from_pretty(s: &str) -> Self { let mut chunk_str = String::new(); let mut ops = vec![]; @@ -647,34 +605,39 @@ impl StreamChunkTestExt for StreamChunk { } } - fn valid(&self) -> bool { + /// Validate the `StreamChunk` layout. + pub fn valid(&self) -> bool { let len = self.ops.len(); let data = &self.data; data.visibility().len() == len && data.columns().iter().all(|col| col.len() == len) } - fn concat(chunks: Vec) -> StreamChunk { - assert!(!chunks.is_empty()); - let mut ops = vec![]; - let mut data_chunks = vec![]; - let mut capacity = 0; + /// Concatenate multiple `StreamChunk` into one. + /// + /// Panics if `chunks` is empty. + pub fn concat(chunks: Vec) -> StreamChunk { + let data_types = chunks[0].data_types(); + let size = chunks.iter().map(|c| c.cardinality()).sum::(); + + // `append_row` will cause the builder to finish immediately once capacity is met. + // Hence, we allocate an extra row here, to avoid the builder finishing prematurely. + // This just makes the code cleaner, since we can loop through all rows, and consume it finally. + // TODO: introduce `new_unlimited` to decouple memory reservation from builder capacity. + let mut builder = StreamChunkBuilder::new(size + 1, data_types); + for chunk in chunks { - capacity += chunk.capacity(); - ops.extend(chunk.ops.iter()); - data_chunks.push(chunk.data); - } - let data = DataChunk::rechunk(&data_chunks, capacity) - .unwrap() - .into_iter() - .next() - .unwrap(); - StreamChunk { - ops: ops.into(), - data, + // TODO: directly append chunks. 
+ for (op, row) in chunk.rows() { + let none = builder.append_row(op, row); + debug_assert!(none.is_none()); + } } + + builder.take().expect("chunk should not be empty") } - fn sort_rows(self) -> Self { + /// Sort rows. + pub fn sort_rows(self) -> Self { if self.capacity() == 0 { return self; } @@ -693,7 +656,7 @@ impl StreamChunkTestExt for StreamChunk { /// Generate `num_of_chunks` data chunks with type `data_types`, /// where each data chunk has cardinality of `chunk_size`. /// TODO(kwannoel): Generate different types of op, different vis. - fn gen_stream_chunks( + pub fn gen_stream_chunks( num_of_chunks: usize, chunk_size: usize, data_types: &[DataType], @@ -709,7 +672,7 @@ impl StreamChunkTestExt for StreamChunk { ) } - fn gen_stream_chunks_inner( + pub fn gen_stream_chunks_inner( num_of_chunks: usize, chunk_size: usize, data_types: &[DataType], diff --git a/src/common/src/array/stream_chunk_builder.rs b/src/common/src/array/stream_chunk_builder.rs new file mode 100644 index 0000000000000..f9e7001bed8e6 --- /dev/null +++ b/src/common/src/array/stream_chunk_builder.rs @@ -0,0 +1,146 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::array::stream_record::Record; +use crate::array::{ArrayBuilderImpl, Op, StreamChunk}; +use crate::row::Row; +use crate::types::{DataType, DatumRef}; +use crate::util::iter_util::ZipEqFast; + +/// Build stream chunks with fixed chunk size from rows or records. +pub struct StreamChunkBuilder { + /// operations in the data chunk to build + ops: Vec, + + /// arrays in the data chunk to build + column_builders: Vec, + + /// Data types of columns + data_types: Vec, + + /// Maximum capacity of column builder + capacity: usize, + + /// Size of column builder + size: usize, +} + +impl Drop for StreamChunkBuilder { + fn drop(&mut self) { + // Possible to fail when async task gets cancelled. + if self.size != 0 { + tracing::warn!( + remaining = self.size, + "dropping non-empty stream chunk builder" + ); + } + } +} + +impl StreamChunkBuilder { + pub fn new(chunk_size: usize, data_types: Vec) -> Self { + assert!(chunk_size > 0); + + let ops = Vec::with_capacity(chunk_size); + let column_builders = data_types + .iter() + .map(|datatype| datatype.create_array_builder(chunk_size)) + .collect(); + Self { + ops, + column_builders, + data_types, + capacity: chunk_size, + size: 0, + } + } + + /// Increase chunk size + /// + /// A [`StreamChunk`] will be returned when `size == capacity` + #[must_use] + fn inc_size(&mut self) -> Option { + self.size += 1; + + // Take a chunk when capacity is exceeded. Splitting `UpdateDelete` and `UpdateInsert` + // should be avoided, so when the last one is `UpdateDelete`, we delay the chunk until + // `UpdateInsert` comes. This means the output chunk size may exceed the given `chunk_size`, + // and theoretically at most `chunk_size + 1` if inputs are consistent. 
+ if self.size >= self.capacity && self.ops[self.ops.len() - 1] != Op::UpdateDelete { + self.take() + } else { + None + } + } + + /// Append an iterator of output index and datum to the builder, return a chunk if the builder + /// is full. + /// + /// Note: the caller must ensure that each column occurs exactly once in `iter`. + #[must_use] + pub fn append_iter<'a>( + &mut self, + op: Op, + iter: impl IntoIterator)>, + ) -> Option { + self.ops.push(op); + for (i, datum) in iter { + self.column_builders[i].append(datum); + } + self.inc_size() + } + + /// Append a row to the builder, return a chunk if the builder is full. + #[must_use] + pub fn append_row(&mut self, op: Op, row: impl Row) -> Option { + self.append_iter(op, row.iter().enumerate()) + } + + /// Append a record to the builder, return a chunk if the builder is full. + #[must_use] + pub fn append_record(&mut self, record: Record) -> Option { + match record { + Record::Insert { new_row } => self.append_row(Op::Insert, new_row), + Record::Delete { old_row } => self.append_row(Op::Delete, old_row), + Record::Update { old_row, new_row } => { + let none = self.append_row(Op::UpdateDelete, old_row); + debug_assert!(none.is_none()); + self.append_row(Op::UpdateInsert, new_row) + } + } + } + + #[must_use] + pub fn take(&mut self) -> Option { + if self.size == 0 { + return None; + } + + self.size = 0; + let new_columns = self + .column_builders + .iter_mut() + .zip_eq_fast(&self.data_types) + .map(|(builder, datatype)| { + std::mem::replace(builder, datatype.create_array_builder(self.capacity)).finish() + }) + .map(Into::into) + .collect::>(); + + Some(StreamChunk::new( + std::mem::replace(&mut self.ops, Vec::with_capacity(self.capacity)), + new_columns, + )) + } +} diff --git a/src/common/src/array/stream_record.rs b/src/common/src/array/stream_record.rs index f9b87adeccf63..1c7b7062962cf 100644 --- a/src/common/src/array/stream_record.rs +++ b/src/common/src/array/stream_record.rs @@ -61,16 +61,13 @@ impl Record { pub fn to_stream_chunk(&self, data_types: &[DataType]) -> StreamChunk { match self { Record::Insert { new_row } => { - StreamChunk::from_rows(&[(Op::Insert, new_row.to_owned_row())], data_types) + StreamChunk::from_rows(&[(Op::Insert, new_row)], data_types) } Record::Delete { old_row } => { - StreamChunk::from_rows(&[(Op::Delete, old_row.to_owned_row())], data_types) + StreamChunk::from_rows(&[(Op::Delete, old_row)], data_types) } Record::Update { old_row, new_row } => StreamChunk::from_rows( - &[ - (Op::UpdateDelete, old_row.to_owned_row()), - (Op::UpdateInsert, new_row.to_owned_row()), - ], + &[(Op::UpdateDelete, old_row), (Op::UpdateInsert, new_row)], data_types, ), } diff --git a/src/common/src/util/chunk_coalesce.rs b/src/common/src/util/chunk_coalesce.rs index 3bd56b19e434d..cb7845816bfeb 100644 --- a/src/common/src/util/chunk_coalesce.rs +++ b/src/common/src/util/chunk_coalesce.rs @@ -39,6 +39,8 @@ pub struct DataChunkBuilder { impl DataChunkBuilder { pub fn new(data_types: Vec, batch_size: usize) -> Self { + assert!(batch_size > 0); + Self { data_types, batch_size, @@ -124,7 +126,8 @@ impl DataChunkBuilder { pub fn append_chunk(&mut self, data_chunk: DataChunk) -> AppendDataChunk<'_> { AppendDataChunk { builder: self, - remaining: Some(SlicedDataChunk::new_checked(data_chunk)), + remaining: (data_chunk.capacity() > 0) // defensive check for empty chunk + .then(|| SlicedDataChunk::new_checked(data_chunk)), } } diff --git a/src/compute/src/rpc/service/stream_service.rs b/src/compute/src/rpc/service/stream_service.rs index 
4dceda1719f81..4e1765e32824f 100644 --- a/src/compute/src/rpc/service/stream_service.rs +++ b/src/compute/src/rpc/service/stream_service.rs @@ -247,7 +247,7 @@ impl StreamService for StreamServiceImpl { worker_id: self.env.worker_id(), table_watermarks: table_watermarks .into_iter() - .map(|(key, value)| (key.table_id as u64, value.to_protobuf())) + .map(|(key, value)| (key.table_id, value.to_protobuf())) .collect(), })) } diff --git a/src/frontend/planner_test/tests/testdata/input/emit_on_window_close.yaml b/src/frontend/planner_test/tests/testdata/input/emit_on_window_close.yaml index 158e35850262a..bb917afc60a29 100644 --- a/src/frontend/planner_test/tests/testdata/input/emit_on_window_close.yaml +++ b/src/frontend/planner_test/tests/testdata/input/emit_on_window_close.yaml @@ -40,3 +40,19 @@ WITH (connector = 'blackhole'); expected_outputs: - explain_output +- sql: | + CREATE table s1 (id int, value int, ts TIMESTAMP, WATERMARK FOR ts AS ts - INTERVAL '20' SECOND ) append only; + CREATE table s2 (id int, value int, ts TIMESTAMP, WATERMARK FOR ts AS ts - INTERVAL '20' SECOND) append only; + select *, count(*) over (partition by value2 order by ts2) from ( + SELECT s1.id AS id1, + s1.value AS value1, + s2.id AS id2, + s2.value AS value2, + s1.ts as ts1, + s2.ts as ts2 + FROM s1 JOIN s2 + ON s1.id = s2.id and s1.ts >= s2.ts and s1.ts - INTERVAL '1' MINUTE <= s2.ts + ); + expected_outputs: + - eowc_stream_plan + - stream_plan \ No newline at end of file diff --git a/src/frontend/planner_test/tests/testdata/output/emit_on_window_close.yaml b/src/frontend/planner_test/tests/testdata/output/emit_on_window_close.yaml index cd3019382bd66..acdc201f0c6fb 100644 --- a/src/frontend/planner_test/tests/testdata/output/emit_on_window_close.yaml +++ b/src/frontend/planner_test/tests/testdata/output/emit_on_window_close.yaml @@ -215,3 +215,37 @@ └─StreamEowcSort { sort_column: t.tm } └─StreamExchange { dist: HashShard(t.bar) } └─StreamTableScan { table: t, columns: [tm, foo, bar, _row_id] } +- sql: | + CREATE table s1 (id int, value int, ts TIMESTAMP, WATERMARK FOR ts AS ts - INTERVAL '20' SECOND ) append only; + CREATE table s2 (id int, value int, ts TIMESTAMP, WATERMARK FOR ts AS ts - INTERVAL '20' SECOND) append only; + select *, count(*) over (partition by value2 order by ts2) from ( + SELECT s1.id AS id1, + s1.value AS value1, + s2.id AS id2, + s2.value AS value2, + s1.ts as ts1, + s2.ts as ts2 + FROM s1 JOIN s2 + ON s1.id = s2.id and s1.ts >= s2.ts and s1.ts - INTERVAL '1' MINUTE <= s2.ts + ); + stream_plan: |- + StreamMaterialize { columns: [id1, value1, id2, value2, ts1, ts2, s1._row_id(hidden), s2._row_id(hidden), count], stream_key: [s1._row_id, s2._row_id, id1, value2], pk_columns: [s1._row_id, s2._row_id, id1, value2], pk_conflict: NoCheck } + └─StreamOverWindow { window_functions: [count() OVER(PARTITION BY s2.value ORDER BY s2.ts ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)] } + └─StreamExchange { dist: HashShard(s2.value) } + └─StreamHashJoin [interval, append_only] { type: Inner, predicate: s1.id = s2.id AND (s1.ts >= s2.ts) AND ($expr1 <= s2.ts), conditions_to_clean_left_state_table: (s1.ts >= s2.ts), conditions_to_clean_right_state_table: ($expr1 <= s2.ts), output_watermarks: [s1.ts, s2.ts], output: [s1.id, s1.value, s2.id, s2.value, s1.ts, s2.ts, s1._row_id, s2._row_id] } + ├─StreamExchange { dist: HashShard(s1.id) } + │ └─StreamProject { exprs: [s1.id, s1.value, s1.ts, (s1.ts - '00:01:00':Interval) as $expr1, s1._row_id], output_watermarks: [s1.ts, $expr1] } + │ 
└─StreamTableScan { table: s1, columns: [s1.id, s1.value, s1.ts, s1._row_id], pk: [s1._row_id], dist: UpstreamHashShard(s1._row_id) } + └─StreamExchange { dist: HashShard(s2.id) } + └─StreamTableScan { table: s2, columns: [s2.id, s2.value, s2.ts, s2._row_id], pk: [s2._row_id], dist: UpstreamHashShard(s2._row_id) } + eowc_stream_plan: |- + StreamMaterialize { columns: [id1, value1, id2, value2, ts1, ts2, s1._row_id(hidden), s2._row_id(hidden), count], stream_key: [s1._row_id, s2._row_id, id1, value2], pk_columns: [s1._row_id, s2._row_id, id1, value2], pk_conflict: NoCheck } + └─StreamEowcOverWindow { window_functions: [count() OVER(PARTITION BY s2.value ORDER BY s2.ts ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)] } + └─StreamEowcSort { sort_column: s2.ts } + └─StreamExchange { dist: HashShard(s2.value) } + └─StreamHashJoin [interval, append_only] { type: Inner, predicate: s1.id = s2.id AND (s1.ts >= s2.ts) AND ($expr1 <= s2.ts), conditions_to_clean_left_state_table: (s1.ts >= s2.ts), conditions_to_clean_right_state_table: ($expr1 <= s2.ts), output_watermarks: [s1.ts, s2.ts], output: [s1.id, s1.value, s2.id, s2.value, s1.ts, s2.ts, s1._row_id, s2._row_id] } + ├─StreamExchange { dist: HashShard(s1.id) } + │ └─StreamProject { exprs: [s1.id, s1.value, s1.ts, (s1.ts - '00:01:00':Interval) as $expr1, s1._row_id], output_watermarks: [s1.ts, $expr1] } + │ └─StreamTableScan { table: s1, columns: [s1.id, s1.value, s1.ts, s1._row_id], pk: [s1._row_id], dist: UpstreamHashShard(s1._row_id) } + └─StreamExchange { dist: HashShard(s2.id) } + └─StreamTableScan { table: s2, columns: [s2.id, s2.value, s2.ts, s2._row_id], pk: [s2._row_id], dist: UpstreamHashShard(s2._row_id) } diff --git a/src/frontend/planner_test/tests/testdata/output/nexmark_watermark.yaml b/src/frontend/planner_test/tests/testdata/output/nexmark_watermark.yaml index 53df7f872633e..ba5b6198f01db 100644 --- a/src/frontend/planner_test/tests/testdata/output/nexmark_watermark.yaml +++ b/src/frontend/planner_test/tests/testdata/output/nexmark_watermark.yaml @@ -1018,9 +1018,9 @@ └─BatchProject { exprs: [event_type, person, auction, bid, Case((event_type = 0:Int32), Field(person, 6:Int32), (event_type = 1:Int32), Field(auction, 5:Int32), Field(bid, 5:Int32)) as $expr10, _row_id] } └─BatchSource { source: nexmark, columns: [event_type, person, auction, bid, _row_id], filter: (None, None) } stream_plan: |- - StreamMaterialize { columns: [id, item_name, description, initial_bid, reserve, date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id(hidden), _row_id#1(hidden)], stream_key: [id], pk_columns: [id], pk_conflict: NoCheck, watermark_columns: [bid_date_time] } - └─StreamGroupTopN [append_only] { order: [$expr12 DESC, $expr1 ASC], limit: 1, offset: 0, group_key: [$expr2], output_watermarks: [$expr1] } - └─StreamHashJoin [append_only] { type: Inner, predicate: $expr2 = $expr10 AND ($expr1 >= $expr1) AND ($expr1 <= $expr7), conditions_to_clean_right_state_table: ($expr1 >= $expr1), output_watermarks: [$expr1], output: [$expr2, $expr3, $expr4, $expr5, $expr6, $expr1, $expr7, $expr8, $expr9, $expr10, $expr11, $expr12, $expr1, _row_id, _row_id] } + StreamMaterialize { columns: [id, item_name, description, initial_bid, reserve, date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id(hidden), _row_id#1(hidden)], stream_key: [id], pk_columns: [id], pk_conflict: NoCheck, watermark_columns: [date_time, bid_date_time] } + └─StreamGroupTopN [append_only] { order: [$expr12 DESC, $expr1 ASC], 
limit: 1, offset: 0, group_key: [$expr2], output_watermarks: [$expr1, $expr1] } + └─StreamHashJoin [append_only] { type: Inner, predicate: $expr2 = $expr10 AND ($expr1 >= $expr1) AND ($expr1 <= $expr7), conditions_to_clean_right_state_table: ($expr1 >= $expr1), output_watermarks: [$expr1, $expr1], output: [$expr2, $expr3, $expr4, $expr5, $expr6, $expr1, $expr7, $expr8, $expr9, $expr10, $expr11, $expr12, $expr1, _row_id, _row_id] } ├─StreamExchange { dist: HashShard($expr2) } │ └─StreamProject { exprs: [Field(auction, 0:Int32) as $expr2, Field(auction, 1:Int32) as $expr3, Field(auction, 2:Int32) as $expr4, Field(auction, 3:Int32) as $expr5, Field(auction, 4:Int32) as $expr6, $expr1, Field(auction, 6:Int32) as $expr7, Field(auction, 7:Int32) as $expr8, Field(auction, 8:Int32) as $expr9, _row_id], output_watermarks: [$expr1] } │ └─StreamFilter { predicate: (event_type = 1:Int32) } @@ -1043,9 +1043,9 @@ └─StreamSource { source: nexmark, columns: [event_type, person, auction, bid, _row_id] } stream_dist_plan: |+ Fragment 0 - StreamMaterialize { columns: [id, item_name, description, initial_bid, reserve, date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id(hidden), _row_id#1(hidden)], stream_key: [id], pk_columns: [id], pk_conflict: NoCheck, watermark_columns: [bid_date_time] } { materialized table: 4294967294 } - └── StreamGroupTopN [append_only] { order: [$expr12 DESC, $expr1 ASC], limit: 1, offset: 0, group_key: [$expr2], output_watermarks: [$expr1] } { state table: 0 } - └── StreamHashJoin [append_only] { type: Inner, predicate: $expr2 = $expr10 AND ($expr1 >= $expr1) AND ($expr1 <= $expr7), conditions_to_clean_right_state_table: ($expr1 >= $expr1), output_watermarks: [$expr1], output: [$expr2, $expr3, $expr4, $expr5, $expr6, $expr1, $expr7, $expr8, $expr9, $expr10, $expr11, $expr12, $expr1, _row_id, _row_id] } + StreamMaterialize { columns: [id, item_name, description, initial_bid, reserve, date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id(hidden), _row_id#1(hidden)], stream_key: [id], pk_columns: [id], pk_conflict: NoCheck, watermark_columns: [date_time, bid_date_time] } { materialized table: 4294967294 } + └── StreamGroupTopN [append_only] { order: [$expr12 DESC, $expr1 ASC], limit: 1, offset: 0, group_key: [$expr2], output_watermarks: [$expr1, $expr1] } { state table: 0 } + └── StreamHashJoin [append_only] { type: Inner, predicate: $expr2 = $expr10 AND ($expr1 >= $expr1) AND ($expr1 <= $expr7), conditions_to_clean_right_state_table: ($expr1 >= $expr1), output_watermarks: [$expr1, $expr1], output: [$expr2, $expr3, $expr4, $expr5, $expr6, $expr1, $expr7, $expr8, $expr9, $expr10, $expr11, $expr12, $expr1, _row_id, _row_id] } ├── left table: 1 ├── right table: 3 ├── left degree table: 2 @@ -1088,10 +1088,10 @@ Table 4294967294 { columns: [ id, item_name, description, initial_bid, reserve, date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id, _row_id#1 ], primary key: [ $0 ASC ], value indices: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 ], distribution key: [ 0 ], read pk prefix len hint: 1 } eowc_stream_plan: |- - StreamMaterialize { columns: [id, item_name, description, initial_bid, reserve, date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id(hidden), _row_id#1(hidden)], stream_key: [id], pk_columns: [id], pk_conflict: NoCheck, watermark_columns: [bid_date_time] } + StreamMaterialize { columns: [id, item_name, description, initial_bid, reserve, 
date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id(hidden), _row_id#1(hidden)], stream_key: [id], pk_columns: [id], pk_conflict: NoCheck, watermark_columns: [date_time] } └─StreamEowcSort { sort_column: $expr1 } - └─StreamGroupTopN [append_only] { order: [$expr12 DESC, $expr1 ASC], limit: 1, offset: 0, group_key: [$expr2], output_watermarks: [$expr1] } - └─StreamHashJoin [append_only] { type: Inner, predicate: $expr2 = $expr10 AND ($expr1 >= $expr1) AND ($expr1 <= $expr7), conditions_to_clean_right_state_table: ($expr1 >= $expr1), output_watermarks: [$expr1], output: [$expr2, $expr3, $expr4, $expr5, $expr6, $expr1, $expr7, $expr8, $expr9, $expr10, $expr11, $expr12, $expr1, _row_id, _row_id] } + └─StreamGroupTopN [append_only] { order: [$expr12 DESC, $expr1 ASC], limit: 1, offset: 0, group_key: [$expr2], output_watermarks: [$expr1, $expr1] } + └─StreamHashJoin [append_only] { type: Inner, predicate: $expr2 = $expr10 AND ($expr1 >= $expr1) AND ($expr1 <= $expr7), conditions_to_clean_right_state_table: ($expr1 >= $expr1), output_watermarks: [$expr1, $expr1], output: [$expr2, $expr3, $expr4, $expr5, $expr6, $expr1, $expr7, $expr8, $expr9, $expr10, $expr11, $expr12, $expr1, _row_id, _row_id] } ├─StreamExchange { dist: HashShard($expr2) } │ └─StreamProject { exprs: [Field(auction, 0:Int32) as $expr2, Field(auction, 1:Int32) as $expr3, Field(auction, 2:Int32) as $expr4, Field(auction, 3:Int32) as $expr5, Field(auction, 4:Int32) as $expr6, $expr1, Field(auction, 6:Int32) as $expr7, Field(auction, 7:Int32) as $expr8, Field(auction, 8:Int32) as $expr9, _row_id], output_watermarks: [$expr1] } │ └─StreamFilter { predicate: (event_type = 1:Int32) } diff --git a/src/frontend/src/catalog/table_catalog.rs b/src/frontend/src/catalog/table_catalog.rs index c8b7b4ef437e4..63fd9e5496919 100644 --- a/src/frontend/src/catalog/table_catalog.rs +++ b/src/frontend/src/catalog/table_catalog.rs @@ -119,8 +119,7 @@ pub struct TableCatalog { /// `None`. pub row_id_index: Option, - /// The column indices which are stored in the state store's value with row-encoding. Currently - /// is not supported yet and expected to be `[0..columns.len()]`. + /// The column indices which are stored in the state store's value with row-encoding. pub value_indices: Vec, /// The full `CREATE TABLE` or `CREATE MATERIALIZED VIEW` definition of the table. diff --git a/src/frontend/src/handler/create_index.rs b/src/frontend/src/handler/create_index.rs index 006230552ea02..1553a84c1a09b 100644 --- a/src/frontend/src/handler/create_index.rs +++ b/src/frontend/src/handler/create_index.rs @@ -14,6 +14,7 @@ use std::collections::{HashMap, HashSet}; use std::rc::Rc; +use std::sync::Arc; use either::Either; use fixedbitset::FixedBitSet; @@ -41,6 +42,7 @@ use crate::optimizer::{OptimizerContext, OptimizerContextRef, PlanRef, PlanRoot} use crate::scheduler::streaming_manager::CreatingStreamingJobInfo; use crate::session::SessionImpl; use crate::stream_fragmenter::build_graph; +use crate::TableCatalog; pub(crate) fn gen_create_index_plan( session: &SessionImpl, @@ -182,7 +184,7 @@ pub(crate) fn gen_create_index_plan( // Manually assemble the materialization plan for the index MV. 
let materialize = assemble_materialize( table_name, - table_desc.clone(), + table.clone(), context, index_table_name.clone(), &index_columns_ordered_expr, @@ -308,7 +310,7 @@ fn build_index_item( /// `distributed_by_columns_len` to represent distributed by columns fn assemble_materialize( table_name: String, - table_desc: Rc, + table_catalog: Arc, context: OptimizerContextRef, index_name: String, index_columns: &[(ExprImpl, OrderType)], @@ -324,7 +326,7 @@ fn assemble_materialize( let logical_scan = LogicalScan::create( table_name, - table_desc.clone(), + table_catalog.clone(), // Index table has no indexes. vec![], context, @@ -348,12 +350,12 @@ fn assemble_materialize( let out_names: Vec = index_columns .iter() .map(|(expr, _)| match expr { - ExprImpl::InputRef(input_ref) => table_desc - .columns + ExprImpl::InputRef(input_ref) => table_catalog + .columns() .get(input_ref.index) .unwrap() - .name - .clone(), + .name() + .to_string(), ExprImpl::FunctionCall(func) => { let func_name = func.func_type().as_str_name().to_string(); let mut name = func_name.clone(); @@ -367,12 +369,12 @@ fn assemble_materialize( }) .chain(include_columns.iter().map(|expr| { match expr { - ExprImpl::InputRef(input_ref) => table_desc - .columns + ExprImpl::InputRef(input_ref) => table_catalog + .columns() .get(input_ref.index) .unwrap() - .name - .clone(), + .name() + .to_string(), _ => unreachable!(), } })) diff --git a/src/frontend/src/handler/create_mv.rs b/src/frontend/src/handler/create_mv.rs index a504a92111cc4..716b3c0cdd852 100644 --- a/src/frontend/src/handler/create_mv.rs +++ b/src/frontend/src/handler/create_mv.rs @@ -189,6 +189,7 @@ It only indicates the physical clustering of the data, which may improve the per if plan.inputs().is_empty() { if let Some(scan) = plan.as_stream_table_scan() { scan.stream_scan_type() == StreamScanType::Backfill + || scan.stream_scan_type() == StreamScanType::ArrangementBackfill } else { false } diff --git a/src/frontend/src/optimizer/plan_node/generic/scan.rs b/src/frontend/src/optimizer/plan_node/generic/scan.rs index c32d942242683..cb033f80e0c6b 100644 --- a/src/frontend/src/optimizer/plan_node/generic/scan.rs +++ b/src/frontend/src/optimizer/plan_node/generic/scan.rs @@ -14,6 +14,7 @@ use std::collections::{BTreeMap, HashMap}; use std::rc::Rc; +use std::sync::Arc; use educe::Educe; use fixedbitset::FixedBitSet; @@ -28,6 +29,7 @@ use crate::expr::{Expr, ExprImpl, ExprRewriter, ExprVisitor, FunctionCall, Input use crate::optimizer::optimizer_context::OptimizerContextRef; use crate::optimizer::property::{Cardinality, FunctionalDependencySet, Order}; use crate::utils::{ColIndexMappingRewriteExt, Condition}; +use crate::TableCatalog; /// [`Scan`] returns contents of a table or other equivalent object #[derive(Debug, Clone, Educe)] @@ -37,7 +39,14 @@ pub struct Scan { /// Include `output_col_idx` and columns required in `predicate` pub required_col_idx: Vec, pub output_col_idx: Vec, - /// Descriptor of the table + /// Table Catalog of the upstream table that the descriptor is derived from. + pub table_catalog: Arc, + // FIXME(kwannoel): Currently many places in the code reference this, + // but now we have table catalog. + // We should remove this and use table catalog in those call-sites instead. + // It's introduced in https://github.com/risingwavelabs/risingwave/pull/13622. + // We kept this field to avoid extensive refactor in that PR. + /// Table Desc (subset of table catalog). 
pub table_desc: Rc, /// Descriptors of all indexes on this table pub indexes: Vec>, @@ -172,7 +181,7 @@ impl Scan { pub fn to_index_scan( &self, index_name: &str, - index_table_desc: Rc, + index_table_catalog: Arc, primary_to_secondary_mapping: &BTreeMap, function_mapping: &HashMap, ) -> Self { @@ -221,7 +230,7 @@ impl Scan { Self::new( index_name.to_string(), new_output_col_idx, - index_table_desc, + index_table_catalog, vec![], self.ctx.clone(), new_predicate, @@ -235,7 +244,7 @@ impl Scan { pub(crate) fn new( table_name: String, output_col_idx: Vec, // the column index in the table - table_desc: Rc, + table_catalog: Arc, indexes: Vec>, ctx: OptimizerContextRef, predicate: Condition, // refers to column indexes of the table @@ -245,7 +254,7 @@ impl Scan { Self::new_inner( table_name, output_col_idx, - table_desc, + table_catalog, indexes, ctx, predicate, @@ -258,7 +267,7 @@ impl Scan { pub(crate) fn new_inner( table_name: String, output_col_idx: Vec, // the column index in the table - table_desc: Rc, + table_catalog: Arc, indexes: Vec>, ctx: OptimizerContextRef, predicate: Condition, // refers to column indexes of the table @@ -274,17 +283,20 @@ impl Scan { // required columns, i.e., the mapping from operator_idx to table_idx. let mut required_col_idx = output_col_idx.clone(); - let predicate_col_idx = predicate.collect_input_refs(table_desc.columns.len()); + let predicate_col_idx = predicate.collect_input_refs(table_catalog.columns().len()); predicate_col_idx.ones().for_each(|idx| { if !required_col_idx.contains(&idx) { required_col_idx.push(idx); } }); + let table_desc = Rc::new(table_catalog.table_desc()); + Self { table_name, required_col_idx, output_col_idx, + table_catalog, table_desc, indexes, predicate, diff --git a/src/frontend/src/optimizer/plan_node/logical_scan.rs b/src/frontend/src/optimizer/plan_node/logical_scan.rs index 269633d5d74bd..36995ad4a3fe6 100644 --- a/src/frontend/src/optimizer/plan_node/logical_scan.rs +++ b/src/frontend/src/optimizer/plan_node/logical_scan.rs @@ -14,6 +14,7 @@ use std::collections::{BTreeMap, HashSet}; use std::rc::Rc; +use std::sync::Arc; use fixedbitset::FixedBitSet; use itertools::Itertools; @@ -21,6 +22,7 @@ use pretty_xmlish::{Pretty, XmlNode}; use risingwave_common::catalog::{ColumnDesc, TableDesc}; use risingwave_common::error::Result; use risingwave_common::util::sort_util::ColumnOrder; +use risingwave_pb::stream_plan::StreamScanType; use super::generic::{GenericPlanNode, GenericPlanRef}; use super::utils::{childless_record, Distill}; @@ -39,6 +41,7 @@ use crate::optimizer::plan_node::{ use crate::optimizer::property::{Cardinality, Order}; use crate::optimizer::rule::IndexSelectionRule; use crate::utils::{ColIndexMapping, Condition, ConditionDisplay}; +use crate::TableCatalog; /// `LogicalScan` returns contents of a table or other equivalent object #[derive(Debug, Clone, PartialEq, Eq, Hash)] @@ -64,16 +67,17 @@ impl LogicalScan { /// Create a [`LogicalScan`] node. Used by planner. 
pub fn create( table_name: String, // explain-only - table_desc: Rc, + table_catalog: Arc, indexes: Vec>, ctx: OptimizerContextRef, for_system_time_as_of_proctime: bool, table_cardinality: Cardinality, ) -> Self { + let output_col_idx: Vec = (0..table_catalog.columns().len()).collect(); generic::Scan::new( table_name, - (0..table_desc.columns.len()).collect(), - table_desc, + output_col_idx, + table_catalog, indexes, ctx, Condition::true_cond(), @@ -96,11 +100,17 @@ impl LogicalScan { self.core.table_cardinality } + // FIXME(kwannoel): Fetch from `table_catalog` + lazily instantiate? /// Get a reference to the logical scan's table desc. pub fn table_desc(&self) -> &TableDesc { self.core.table_desc.as_ref() } + /// FIXME + pub fn table_catalog(&self) -> Arc { + self.core.table_catalog.clone() + } + /// Get the descs of the output columns. pub fn column_descs(&self) -> Vec { self.core.column_descs() @@ -183,7 +193,7 @@ impl LogicalScan { { let index_scan = self.core.to_index_scan( &index.name, - index.index_table.table_desc().into(), + index.index_table.clone(), p2s_mapping, index.function_mapping(), ); @@ -235,7 +245,7 @@ impl LogicalScan { let scan_without_predicate = generic::Scan::new( self.table_name().to_string(), self.required_col_idx().to_vec(), - self.core.table_desc.clone(), + self.core.table_catalog.clone(), // FIXME self.indexes().to_vec(), self.ctx(), Condition::true_cond(), @@ -254,7 +264,7 @@ impl LogicalScan { generic::Scan::new_inner( self.table_name().to_string(), self.output_col_idx().to_vec(), - self.core.table_desc.clone(), + self.table_catalog(), self.indexes().to_vec(), self.base.ctx().clone(), predicate, @@ -268,7 +278,7 @@ impl LogicalScan { generic::Scan::new_inner( self.table_name().to_string(), output_col_idx, - self.core.table_desc.clone(), + self.core.table_catalog.clone(), self.indexes().to_vec(), self.base.ctx().clone(), self.predicate().clone(), @@ -512,7 +522,20 @@ impl ToBatch for LogicalScan { impl ToStream for LogicalScan { fn to_stream(&self, ctx: &mut ToStreamContext) -> Result { if self.predicate().always_true() { - Ok(StreamTableScan::new(self.core.clone()).into()) + if self + .ctx() + .session_ctx() + .config() + .streaming_enable_arrangement_backfill() + { + Ok(StreamTableScan::new_with_stream_scan_type( + self.core.clone(), + StreamScanType::ArrangementBackfill, + ) + .into()) + } else { + Ok(StreamTableScan::new(self.core.clone()).into()) + } } else { let (scan, predicate, project_expr) = self.predicate_pull_up(); let mut plan = LogicalFilter::create(scan.into(), predicate); diff --git a/src/frontend/src/optimizer/plan_node/stream_hash_join.rs b/src/frontend/src/optimizer/plan_node/stream_hash_join.rs index 514b3dfa7df1a..f83e56440fa66 100644 --- a/src/frontend/src/optimizer/plan_node/stream_hash_join.rs +++ b/src/frontend/src/optimizer/plan_node/stream_hash_join.rs @@ -136,38 +136,46 @@ impl StreamHashJoin { continue; } - let (internal, do_state_cleaning) = if key_required_larger < key_required_smaller { - ( - l2i.try_map(key_required_larger), - if !equal_condition_clean_state - && clean_left_state_conjunction_idx.is_none() - { - clean_left_state_conjunction_idx = Some(conjunction_idx); - true - } else { - false - }, - ) - } else { - ( - r2i.try_map(key_required_larger - left_cols_num), - if !equal_condition_clean_state - && clean_right_state_conjunction_idx.is_none() - { - clean_right_state_conjunction_idx = Some(conjunction_idx); - true - } else { - false - }, - ) - }; + let (internal_col1, internal_col2, do_state_cleaning) = + if 
key_required_larger < key_required_smaller { + ( + l2i.try_map(key_required_larger), + r2i.try_map(key_required_smaller - left_cols_num), + if !equal_condition_clean_state + && clean_left_state_conjunction_idx.is_none() + { + clean_left_state_conjunction_idx = Some(conjunction_idx); + true + } else { + false + }, + ) + } else { + ( + r2i.try_map(key_required_larger - left_cols_num), + l2i.try_map(key_required_smaller), + if !equal_condition_clean_state + && clean_right_state_conjunction_idx.is_none() + { + clean_right_state_conjunction_idx = Some(conjunction_idx); + true + } else { + false + }, + ) + }; let mut is_valuable_inequality = do_state_cleaning; - if let Some(internal) = internal + if let Some(internal) = internal_col1 && !watermark_columns.contains(internal) { watermark_columns.insert(internal); is_valuable_inequality = true; } + if let Some(internal) = internal_col2 + && !watermark_columns.contains(internal) + { + watermark_columns.insert(internal); + } if is_valuable_inequality { inequality_pairs.push(( do_state_cleaning, diff --git a/src/frontend/src/optimizer/plan_node/stream_table_scan.rs b/src/frontend/src/optimizer/plan_node/stream_table_scan.rs index b40f1f758c3e4..7dfff36ed7af8 100644 --- a/src/frontend/src/optimizer/plan_node/stream_table_scan.rs +++ b/src/frontend/src/optimizer/plan_node/stream_table_scan.rs @@ -13,11 +13,11 @@ // limitations under the License. use std::collections::{BTreeMap, HashMap}; -use std::rc::Rc; +use std::sync::Arc; use itertools::Itertools; use pretty_xmlish::{Pretty, XmlNode}; -use risingwave_common::catalog::{Field, TableDesc}; +use risingwave_common::catalog::Field; use risingwave_common::hash::VirtualNode; use risingwave_common::types::DataType; use risingwave_common::util::sort_util::OrderType; @@ -97,14 +97,14 @@ impl StreamTableScan { pub fn to_index_scan( &self, index_name: &str, - index_table_desc: Rc, + index_table_catalog: Arc, primary_to_secondary_mapping: &BTreeMap, function_mapping: &HashMap, stream_scan_type: StreamScanType, ) -> StreamTableScan { let logical_index_scan = self.core.to_index_scan( index_name, - index_table_desc, + index_table_catalog, primary_to_secondary_mapping, function_mapping, ); @@ -118,6 +118,11 @@ impl StreamTableScan { self.stream_scan_type } + // TODO: Add note to reviewer about safety, because of `generic::Scan` limitation. + fn get_upstream_state_table(&self) -> &TableCatalog { + self.core.table_catalog.as_ref() + } + /// Build catalog for backfill state /// /// Schema: | vnode | pk ... | `backfill_finished` | `row_count` | @@ -244,7 +249,9 @@ impl StreamTableScan { // The required columns from the table (both scan and upstream). let upstream_column_ids = match self.stream_scan_type { // For backfill, we additionally need the primary key columns. - StreamScanType::Backfill => self.core.output_and_pk_column_ids(), + StreamScanType::Backfill | StreamScanType::ArrangementBackfill => { + self.core.output_and_pk_column_ids() + } StreamScanType::Chain | StreamScanType::Rearrange | StreamScanType::UpstreamOnly => { self.core.output_column_ids() } @@ -270,6 +277,19 @@ impl StreamTableScan { let upstream_schema = snapshot_schema.clone(); + // TODO: snapshot read of upstream mview + let batch_plan_node = BatchPlanNode { + table_desc: Some(self.core.table_desc.to_protobuf()), + column_ids: upstream_column_ids.clone(), + }; + + let catalog = self + .build_backfill_state_catalog(state) + .to_internal_table_prost(); + + // For backfill, we first read pk + output_indices from upstream. 
+ // On this, we need to further project `output_indices` to the downstream. + // This `output_indices` refers to that. let output_indices = self .core .output_column_ids() @@ -282,16 +302,14 @@ impl StreamTableScan { }) .collect_vec(); - // TODO: snapshot read of upstream mview - let batch_plan_node = BatchPlanNode { - table_desc: Some(self.core.table_desc.to_protobuf()), - column_ids: upstream_column_ids.clone(), + // This refers to the output indices of the originating stream. + let upstream_table_catalog = self.get_upstream_state_table().clone(); + let arrangement_table = if self.stream_scan_type == StreamScanType::ArrangementBackfill { + Some(upstream_table_catalog.to_internal_table_prost()) + } else { + None }; - let catalog = self - .build_backfill_state_catalog(state) - .to_internal_table_prost(); - let node_body = PbNodeBody::StreamScan(StreamScanNode { table_id: self.core.table_desc.table_id.table_id, stream_scan_type: self.stream_scan_type as i32, @@ -301,6 +319,7 @@ impl StreamTableScan { // The table desc used by backfill executor table_desc: Some(self.core.table_desc.to_protobuf()), state_table: Some(catalog), + arrangement_table, rate_limit: self.base.ctx().overwrite_options().streaming_rate_limit, ..Default::default() }); @@ -308,6 +327,7 @@ impl StreamTableScan { PbStreamNode { fields: self.schema().to_prost(), input: vec![ + // Upstream updates // The merge node body will be filled by the `ActorBuilder` on the meta service. PbStreamNode { node_body: Some(PbNodeBody::Merge(Default::default())), @@ -316,6 +336,7 @@ impl StreamTableScan { stream_key: vec![], // not used ..Default::default() }, + // Snapshot read PbStreamNode { node_body: Some(PbNodeBody::BatchPlan(batch_plan_node)), operator_id: self.batch_plan_id.0 as u64, @@ -326,7 +347,6 @@ impl StreamTableScan { append_only: true, }, ], - node_body: Some(node_body), stream_key, operator_id: self.base.id().0 as u64, diff --git a/src/frontend/src/optimizer/rule/index_delta_join_rule.rs b/src/frontend/src/optimizer/rule/index_delta_join_rule.rs index c62e15220cbf1..56dcd17692d30 100644 --- a/src/frontend/src/optimizer/rule/index_delta_join_rule.rs +++ b/src/frontend/src/optimizer/rule/index_delta_join_rule.rs @@ -93,7 +93,7 @@ impl Rule for IndexDeltaJoinRule { table_scan .to_index_scan( index.index_table.name.as_str(), - index.index_table.table_desc().into(), + index.index_table.clone(), p2s_mapping, index.function_mapping(), stream_scan_type, diff --git a/src/frontend/src/optimizer/rule/index_selection_rule.rs b/src/frontend/src/optimizer/rule/index_selection_rule.rs index 15d45fda096e9..1e61baf64b82b 100644 --- a/src/frontend/src/optimizer/rule/index_selection_rule.rs +++ b/src/frontend/src/optimizer/rule/index_selection_rule.rs @@ -227,7 +227,7 @@ impl IndexSelectionRule { let index_scan = LogicalScan::create( index.index_table.name.clone(), - index.index_table.table_desc().into(), + index.index_table.clone(), vec![], logical_scan.ctx(), false, @@ -236,7 +236,7 @@ impl IndexSelectionRule { let primary_table_scan = LogicalScan::create( index.primary_table.name.clone(), - index.primary_table.table_desc().into(), + (*index.primary_table).clone().into(), vec![], logical_scan.ctx(), false, @@ -335,7 +335,7 @@ impl IndexSelectionRule { let primary_table_scan = LogicalScan::create( logical_scan.table_name().to_string(), - primary_table_desc.clone().into(), + logical_scan.table_catalog(), vec![], logical_scan.ctx(), false, @@ -567,7 +567,8 @@ impl IndexSelectionRule { .iter() .map(|x| x.column_index) .collect_vec(), - 
primary_table_desc.clone().into(), + // TODO: Should these be cloning the underlying rc instead? + logical_scan.table_catalog(), vec![], logical_scan.ctx(), Condition { @@ -609,7 +610,7 @@ impl IndexSelectionRule { .iter() .map(|x| x.column_index) .collect_vec(), - index.index_table.table_desc().into(), + index.index_table.clone(), vec![], ctx, new_predicate, diff --git a/src/frontend/src/planner/relation.rs b/src/frontend/src/planner/relation.rs index 20a682bf7fb69..d009bd0b5d7bb 100644 --- a/src/frontend/src/planner/relation.rs +++ b/src/frontend/src/planner/relation.rs @@ -67,17 +67,19 @@ impl Planner { } pub(super) fn plan_base_table(&mut self, base_table: &BoundBaseTable) -> Result { + let for_system_time_as_of_proctime = base_table.for_system_time_as_of_proctime; + let table_cardinality = base_table.table_catalog.cardinality; Ok(LogicalScan::create( base_table.table_catalog.name().to_string(), - Rc::new(base_table.table_catalog.table_desc()), + base_table.table_catalog.clone().into(), base_table .table_indexes .iter() .map(|x| x.as_ref().clone().into()) .collect(), self.ctx(), - base_table.for_system_time_as_of_proctime, - base_table.table_catalog.cardinality, + for_system_time_as_of_proctime, + table_cardinality, ) .into()) } diff --git a/src/frontend/src/scheduler/distributed/query.rs b/src/frontend/src/scheduler/distributed/query.rs index 003c19d2ec9ac..347fcf38ce5f8 100644 --- a/src/frontend/src/scheduler/distributed/query.rs +++ b/src/frontend/src/scheduler/distributed/query.rs @@ -437,11 +437,13 @@ impl QueryRunner { #[cfg(test)] pub(crate) mod tests { use std::collections::HashMap; - use std::rc::Rc; use std::sync::{Arc, RwLock}; use fixedbitset::FixedBitSet; - use risingwave_common::catalog::{ColumnDesc, TableDesc}; + use risingwave_common::catalog::hummock::PROPERTIES_RETENTION_SECOND_KEY; + use risingwave_common::catalog::{ + ColumnCatalog, ColumnDesc, ConflictBehavior, DEFAULT_SUPER_USER_ID, + }; use risingwave_common::constants::hummock::TABLE_OPTION_DUMMY_RETENTION_SECOND; use risingwave_common::hash::ParallelUnitMapping; use risingwave_common::types::DataType; @@ -452,6 +454,7 @@ pub(crate) mod tests { use crate::catalog::catalog_service::CatalogReader; use crate::catalog::root_catalog::Catalog; + use crate::catalog::table_catalog::{CreateType, TableType}; use crate::expr::InputRef; use crate::optimizer::plan_node::{ generic, BatchExchange, BatchFilter, BatchHashJoin, EqJoinPredicate, LogicalScan, ToBatch, @@ -468,6 +471,7 @@ pub(crate) mod tests { use crate::session::SessionImpl; use crate::test_utils::MockFrontendMetaClient; use crate::utils::Condition; + use crate::{TableCatalog, WithOptions}; #[tokio::test] async fn test_query_should_not_hang_with_empty_worker() { @@ -514,25 +518,60 @@ pub(crate) mod tests { // let ctx = OptimizerContext::mock().await; let table_id = 0.into(); + let table_catalog: TableCatalog = TableCatalog { + id: table_id, + associated_source_id: None, + name: "test".to_string(), + columns: vec![ + ColumnCatalog { + column_desc: ColumnDesc::new_atomic(DataType::Int32, "a", 0), + is_hidden: false, + }, + ColumnCatalog { + column_desc: ColumnDesc::new_atomic(DataType::Float64, "b", 1), + is_hidden: false, + }, + ColumnCatalog { + column_desc: ColumnDesc::new_atomic(DataType::Int64, "c", 2), + is_hidden: false, + }, + ], + pk: vec![], + stream_key: vec![], + table_type: TableType::Table, + distribution_key: vec![], + append_only: false, + owner: DEFAULT_SUPER_USER_ID, + properties: WithOptions::new( + [( + PROPERTIES_RETENTION_SECOND_KEY.into(), + 
TABLE_OPTION_DUMMY_RETENTION_SECOND.to_string(), + )] + .into_iter() + .collect(), + ), + fragment_id: 0, // FIXME + dml_fragment_id: None, // FIXME + vnode_col_index: None, + row_id_index: None, + value_indices: vec![0, 1, 2], + definition: "".to_string(), + conflict_behavior: ConflictBehavior::NoCheck, + read_prefix_len_hint: 0, + version: None, + watermark_columns: FixedBitSet::with_capacity(3), + dist_key_in_pk: vec![], + cardinality: Cardinality::unknown(), + cleaned_by_watermark: false, + created_at_epoch: None, + initialized_at_epoch: None, + create_type: CreateType::Foreground, + description: None, + incoming_sinks: vec![], + }; let batch_plan_node: PlanRef = LogicalScan::create( "".to_string(), - Rc::new(TableDesc { - table_id, - stream_key: vec![], - pk: vec![], - columns: vec![ - ColumnDesc::new_atomic(DataType::Int32, "a", 0), - ColumnDesc::new_atomic(DataType::Float64, "b", 1), - ColumnDesc::new_atomic(DataType::Int64, "c", 2), - ], - distribution_key: vec![], - append_only: false, - retention_seconds: TABLE_OPTION_DUMMY_RETENTION_SECOND, - value_indices: vec![0, 1, 2], - read_prefix_len_hint: 0, - watermark_columns: FixedBitSet::with_capacity(3), - versioned: false, - }), + table_catalog.into(), vec![], ctx, false, diff --git a/src/meta/src/hummock/manager/mod.rs b/src/meta/src/hummock/manager/mod.rs index 3bd53d67f07d8..06dcd63b69313 100644 --- a/src/meta/src/hummock/manager/mod.rs +++ b/src/meta/src/hummock/manager/mod.rs @@ -268,14 +268,14 @@ pub enum CompactionResumeTrigger { pub struct CommitEpochInfo { pub sstables: Vec, - pub new_table_watermarks: HashMap, + pub new_table_watermarks: HashMap, pub sst_to_context: HashMap, } impl CommitEpochInfo { pub fn new( sstables: Vec, - new_table_watermarks: HashMap, + new_table_watermarks: HashMap, sst_to_context: HashMap, ) -> Self { Self { @@ -975,6 +975,8 @@ impl HummockManager { .retain(|table_id, _| compact_task.existing_table_ids.contains(table_id)); compact_task.table_vnode_partition = table_to_vnode_partition; + compact_task.table_watermarks = + current_version.safe_epoch_table_watermarks(&compact_task.existing_table_ids); let mut compact_task_assignment = BTreeMapTransaction::new(&mut compaction.compact_task_assignment); diff --git a/src/meta/src/hummock/mock_hummock_meta_client.rs b/src/meta/src/hummock/mock_hummock_meta_client.rs index e85157ef9b03e..9b0b66fc1e03f 100644 --- a/src/meta/src/hummock/mock_hummock_meta_client.rs +++ b/src/meta/src/hummock/mock_hummock_meta_client.rs @@ -94,7 +94,7 @@ impl MockHummockMetaClient { &self, epoch: HummockEpoch, sstables: Vec, - new_table_watermarks: HashMap, + new_table_watermarks: HashMap, ) -> Result<()> { let sst_to_worker = sstables .iter() diff --git a/src/meta/src/manager/catalog/fragment.rs b/src/meta/src/manager/catalog/fragment.rs index 4873420a07aa8..1a95d84371cb2 100644 --- a/src/meta/src/manager/catalog/fragment.rs +++ b/src/meta/src/manager/catalog/fragment.rs @@ -184,6 +184,7 @@ impl FragmentManager { let is_backfill = if let Some(node) = &stream_node.node_body && let Some(node) = node.as_stream_scan() { node.stream_scan_type == StreamScanType::Backfill as i32 + || node.stream_scan_type == StreamScanType::ArrangementBackfill as i32 } else { false }; diff --git a/src/meta/src/stream/stream_graph/actor.rs b/src/meta/src/stream/stream_graph/actor.rs index e5579552989ce..8c0472aae14f1 100644 --- a/src/meta/src/stream/stream_graph/actor.rs +++ b/src/meta/src/stream/stream_graph/actor.rs @@ -174,6 +174,7 @@ impl ActorBuilder { downstream_fragment_id: self.fragment_id, 
}]; + // FIXME(kwannoel): This may not hold for Arrangement Backfill. // As we always use the `NoShuffle` exchange for MV on MV, there should be only one // upstream. let upstream_actor_id = upstreams.actors.as_global_ids(); diff --git a/src/object_store/src/object/mod.rs b/src/object_store/src/object/mod.rs index da846b7136a46..8e1ec690dd194 100644 --- a/src/object_store/src/object/mod.rs +++ b/src/object_store/src/object/mod.rs @@ -621,7 +621,11 @@ impl MonitoredObjectStore { .unwrap_or_else(|_| Err(ObjectError::internal("read timeout"))), }; - try_update_failure_metric(&self.object_store_metrics, &res, operation_type); + if let Err(e) = &res && e.is_object_not_found_error() && path.ends_with("manifest.json") { + // Metadata backup's manifest.json not found is expected. + } else { + try_update_failure_metric(&self.object_store_metrics, &res, operation_type); + } let data = res?; self.object_store_metrics diff --git a/src/prost/build.rs b/src/prost/build.rs index dcc4257e627b4..5b8ddda59e098 100644 --- a/src/prost/build.rs +++ b/src/prost/build.rs @@ -116,6 +116,7 @@ fn main() -> Result<(), Box> { .type_attribute("plan_common.ColumnDesc", "#[derive(Eq, Hash)]") .type_attribute("common.ColumnOrder", "#[derive(Eq, Hash)]") .type_attribute("common.OrderType", "#[derive(Eq, Hash)]") + .type_attribute("common.Buffer", "#[derive(Eq)]") // Eq is required to derive `FromJsonQueryResult` for models in risingwave_meta_model_v2. .type_attribute("hummock.TableStats", "#[derive(Eq)]") .type_attribute("hummock.SstableInfo", "#[derive(Eq)]") @@ -133,6 +134,12 @@ fn main() -> Result<(), Box> { .type_attribute("hummock.TableOption", "#[derive(Eq)]") .type_attribute("hummock.InputLevel", "#[derive(Eq)]") .type_attribute("hummock.CompactTask", "#[derive(Eq)]") + .type_attribute("hummock.TableWatermarks", "#[derive(Eq)]") + .type_attribute("hummock.VnodeWatermark", "#[derive(Eq)]") + .type_attribute( + "hummock.TableWatermarks.EpochNewWatermarks", + "#[derive(Eq)]", + ) // =================== .out_dir(out_dir.as_path()) .compile(&protos, &[proto_dir.to_string()]) diff --git a/src/sqlparser/src/ast/legacy_source.rs b/src/sqlparser/src/ast/legacy_source.rs new file mode 100644 index 0000000000000..dbc25d1b927e2 --- /dev/null +++ b/src/sqlparser/src/ast/legacy_source.rs @@ -0,0 +1,430 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Content of this file can be deleted once we stop supporting `create source` syntax v1. +//! New features shall NOT touch this file. 
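+//! It keeps the legacy `ROW FORMAT ...` definitions and converts them into the v2 `FORMAT ... ENCODE ...` form (`ConnectorSchema`) via `into_source_schema_v2`.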
+ +use std::fmt; + +use itertools::Itertools as _; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use crate::ast::{ + AstString, AstVec, ConnectorSchema, Encode, Format, Ident, ObjectName, ParseTo, SqlOption, + Value, +}; +use crate::keywords::Keyword; +use crate::parser::{Parser, ParserError}; +use crate::{impl_fmt_display, impl_parse_to}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub enum CompatibleSourceSchema { + RowFormat(SourceSchema), + V2(ConnectorSchema), +} + +impl fmt::Display for CompatibleSourceSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + CompatibleSourceSchema::RowFormat(inner) => { + write!(f, "{}", inner) + } + CompatibleSourceSchema::V2(inner) => { + write!(f, "{}", inner) + } + } + } +} + +impl CompatibleSourceSchema { + pub(crate) fn into_v2(self) -> ConnectorSchema { + match self { + CompatibleSourceSchema::RowFormat(inner) => inner.into_source_schema_v2(), + CompatibleSourceSchema::V2(inner) => inner, + } + } +} + +impl From for CompatibleSourceSchema { + fn from(value: ConnectorSchema) -> Self { + Self::V2(value) + } +} + +pub fn parse_source_schema(p: &mut Parser) -> Result { + if let Some(schema_v2) = p.parse_schema()? { + Ok(CompatibleSourceSchema::V2(schema_v2)) + } else if p.peek_nth_any_of_keywords(0, &[Keyword::ROW]) + && p.peek_nth_any_of_keywords(1, &[Keyword::FORMAT]) + { + p.expect_keyword(Keyword::ROW)?; + p.expect_keyword(Keyword::FORMAT)?; + let id = p.parse_identifier()?; + let value = id.value.to_ascii_uppercase(); + let schema = match &value[..] { + "JSON" => SourceSchema::Json, + "UPSERT_JSON" => SourceSchema::UpsertJson, + "PROTOBUF" => { + impl_parse_to!(protobuf_schema: ProtobufSchema, p); + SourceSchema::Protobuf(protobuf_schema) + } + "DEBEZIUM_JSON" => SourceSchema::DebeziumJson, + "DEBEZIUM_MONGO_JSON" => SourceSchema::DebeziumMongoJson, + "AVRO" => { + impl_parse_to!(avro_schema: AvroSchema, p); + SourceSchema::Avro(avro_schema) + } + "UPSERT_AVRO" => { + impl_parse_to!(avro_schema: AvroSchema, p); + SourceSchema::UpsertAvro(avro_schema) + } + "MAXWELL" => SourceSchema::Maxwell, + "CANAL_JSON" => SourceSchema::CanalJson, + "CSV" => { + impl_parse_to!(csv_info: CsvInfo, p); + SourceSchema::Csv(csv_info) + } + "NATIVE" => SourceSchema::Native, // used internally by schema change + "DEBEZIUM_AVRO" => { + impl_parse_to!(avro_schema: DebeziumAvroSchema, p); + SourceSchema::DebeziumAvro(avro_schema) + } + "BYTES" => SourceSchema::Bytes, + _ => { + return Err(ParserError::ParserError( + "expected JSON | UPSERT_JSON | PROTOBUF | DEBEZIUM_JSON | DEBEZIUM_AVRO \ + | AVRO | UPSERT_AVRO | MAXWELL | CANAL_JSON | BYTES | NATIVE after ROW FORMAT" + .to_string(), + )) + } + }; + Ok(CompatibleSourceSchema::RowFormat(schema)) + } else { + p.expected("description of the format", p.peek_token()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub enum SourceSchema { + Protobuf(ProtobufSchema), + // Keyword::PROTOBUF ProtobufSchema + Json, // Keyword::JSON + DebeziumJson, // Keyword::DEBEZIUM_JSON + DebeziumMongoJson, + UpsertJson, // Keyword::UPSERT_JSON + Avro(AvroSchema), // Keyword::AVRO + UpsertAvro(AvroSchema), // Keyword::UpsertAVRO + Maxwell, // Keyword::MAXWELL + CanalJson, // Keyword::CANAL_JSON + Csv(CsvInfo), // Keyword::CSV + Native, + DebeziumAvro(DebeziumAvroSchema), // Keyword::DEBEZIUM_AVRO + Bytes, +} + +impl SourceSchema { + pub fn 
into_source_schema_v2(self) -> ConnectorSchema { + let (format, row_encode) = match self { + SourceSchema::Protobuf(_) => (Format::Plain, Encode::Protobuf), + SourceSchema::Json => (Format::Plain, Encode::Json), + SourceSchema::DebeziumJson => (Format::Debezium, Encode::Json), + SourceSchema::DebeziumMongoJson => (Format::DebeziumMongo, Encode::Json), + SourceSchema::UpsertJson => (Format::Upsert, Encode::Json), + SourceSchema::Avro(_) => (Format::Plain, Encode::Avro), + SourceSchema::UpsertAvro(_) => (Format::Upsert, Encode::Avro), + SourceSchema::Maxwell => (Format::Maxwell, Encode::Json), + SourceSchema::CanalJson => (Format::Canal, Encode::Json), + SourceSchema::Csv(_) => (Format::Plain, Encode::Csv), + SourceSchema::DebeziumAvro(_) => (Format::Debezium, Encode::Avro), + SourceSchema::Bytes => (Format::Plain, Encode::Bytes), + SourceSchema::Native => (Format::Native, Encode::Native), + }; + + let row_options = match self { + SourceSchema::Protobuf(schema) => { + let mut options = vec![SqlOption { + name: ObjectName(vec![Ident { + value: "message".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(schema.message_name.0), + }]; + if schema.use_schema_registry { + options.push(SqlOption { + name: ObjectName(vec![Ident { + value: "schema.registry".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(schema.row_schema_location.0), + }); + } else { + options.push(SqlOption { + name: ObjectName(vec![Ident { + value: "schema.location".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(schema.row_schema_location.0), + }) + } + options + } + SourceSchema::Avro(schema) | SourceSchema::UpsertAvro(schema) => { + if schema.use_schema_registry { + vec![SqlOption { + name: ObjectName(vec![Ident { + value: "schema.registry".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(schema.row_schema_location.0), + }] + } else { + vec![SqlOption { + name: ObjectName(vec![Ident { + value: "schema.location".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(schema.row_schema_location.0), + }] + } + } + SourceSchema::DebeziumAvro(schema) => { + vec![SqlOption { + name: ObjectName(vec![Ident { + value: "schema.registry".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(schema.row_schema_location.0), + }] + } + SourceSchema::Csv(schema) => { + vec![ + SqlOption { + name: ObjectName(vec![Ident { + value: "delimiter".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString( + String::from_utf8_lossy(&[schema.delimiter]).into(), + ), + }, + SqlOption { + name: ObjectName(vec![Ident { + value: "without_header".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(if schema.has_header { + "false".into() + } else { + "true".into() + }), + }, + ] + } + _ => vec![], + }; + + ConnectorSchema { + format, + row_encode, + row_options, + } + } +} + +impl fmt::Display for SourceSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "ROW FORMAT ")?; + match self { + SourceSchema::Protobuf(protobuf_schema) => write!(f, "PROTOBUF {}", protobuf_schema), + SourceSchema::Json => write!(f, "JSON"), + SourceSchema::UpsertJson => write!(f, "UPSERT_JSON"), + SourceSchema::Maxwell => write!(f, "MAXWELL"), + SourceSchema::DebeziumJson => write!(f, "DEBEZIUM_JSON"), + SourceSchema::DebeziumMongoJson => write!(f, "DEBEZIUM_MONGO_JSON"), + SourceSchema::Avro(avro_schema) => write!(f, "AVRO {}", avro_schema), + SourceSchema::UpsertAvro(avro_schema) => write!(f, 
"UPSERT_AVRO {}", avro_schema), + SourceSchema::CanalJson => write!(f, "CANAL_JSON"), + SourceSchema::Csv(csv_info) => write!(f, "CSV {}", csv_info), + SourceSchema::Native => write!(f, "NATIVE"), + SourceSchema::DebeziumAvro(avro_schema) => write!(f, "DEBEZIUM_AVRO {}", avro_schema), + SourceSchema::Bytes => write!(f, "BYTES"), + } + } +} + +// sql_grammar!(ProtobufSchema { +// [Keyword::MESSAGE], +// message_name: AstString, +// [Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], +// row_schema_location: AstString, +// }); +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct ProtobufSchema { + pub message_name: AstString, + pub row_schema_location: AstString, + pub use_schema_registry: bool, +} + +impl ParseTo for ProtobufSchema { + fn parse_to(p: &mut Parser) -> Result { + impl_parse_to!([Keyword::MESSAGE], p); + impl_parse_to!(message_name: AstString, p); + impl_parse_to!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], p); + impl_parse_to!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], p); + impl_parse_to!(row_schema_location: AstString, p); + Ok(Self { + message_name, + row_schema_location, + use_schema_registry, + }) + } +} + +impl fmt::Display for ProtobufSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut v: Vec = vec![]; + impl_fmt_display!([Keyword::MESSAGE], v); + impl_fmt_display!(message_name, v, self); + impl_fmt_display!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], v); + impl_fmt_display!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], v, self); + impl_fmt_display!(row_schema_location, v, self); + v.iter().join(" ").fmt(f) + } +} + +// sql_grammar!(AvroSchema { +// [Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION, [Keyword::CONFLUENT, Keyword::SCHEMA, +// Keyword::REGISTRY]], row_schema_location: AstString, +// }); +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct AvroSchema { + pub row_schema_location: AstString, + pub use_schema_registry: bool, +} +impl ParseTo for AvroSchema { + fn parse_to(p: &mut Parser) -> Result { + impl_parse_to!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], p); + impl_parse_to!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], p); + impl_parse_to!(row_schema_location: AstString, p); + Ok(Self { + row_schema_location, + use_schema_registry, + }) + } +} + +impl fmt::Display for AvroSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut v: Vec = vec![]; + impl_fmt_display!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], v); + impl_fmt_display!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], v, self); + impl_fmt_display!(row_schema_location, v, self); + v.iter().join(" ").fmt(f) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct DebeziumAvroSchema { + pub row_schema_location: AstString, +} + +impl fmt::Display for DebeziumAvroSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut v: Vec = vec![]; + impl_fmt_display!( + [ + Keyword::ROW, + Keyword::SCHEMA, + Keyword::LOCATION, + Keyword::CONFLUENT, + Keyword::SCHEMA, + Keyword::REGISTRY + ], + v + ); + impl_fmt_display!(row_schema_location, v, self); + v.iter().join(" ").fmt(f) + } +} + +impl ParseTo for DebeziumAvroSchema { + fn parse_to(p: &mut Parser) -> 
Result { + impl_parse_to!( + [ + Keyword::ROW, + Keyword::SCHEMA, + Keyword::LOCATION, + Keyword::CONFLUENT, + Keyword::SCHEMA, + Keyword::REGISTRY + ], + p + ); + impl_parse_to!(row_schema_location: AstString, p); + Ok(Self { + row_schema_location, + }) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct CsvInfo { + pub delimiter: u8, + pub has_header: bool, +} + +pub fn get_delimiter(chars: &str) -> Result { + match chars { + "," => Ok(b','), // comma + "\t" => Ok(b'\t'), // tab + other => Err(ParserError::ParserError(format!( + "The delimiter should be one of ',', E'\\t', but got {:?}", + other + ))), + } +} + +impl ParseTo for CsvInfo { + fn parse_to(p: &mut Parser) -> Result { + impl_parse_to!(without_header => [Keyword::WITHOUT, Keyword::HEADER], p); + impl_parse_to!([Keyword::DELIMITED, Keyword::BY], p); + impl_parse_to!(delimiter: AstString, p); + let delimiter = get_delimiter(delimiter.0.as_str())?; + Ok(Self { + delimiter, + has_header: !without_header, + }) + } +} + +impl fmt::Display for CsvInfo { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut v: Vec = vec![]; + if !self.has_header { + v.push(format!( + "{}", + AstVec([Keyword::WITHOUT, Keyword::HEADER].to_vec()) + )); + } + impl_fmt_display!(delimiter, v, self); + v.iter().join(" ").fmt(f) + } +} diff --git a/src/sqlparser/src/ast/mod.rs b/src/sqlparser/src/ast/mod.rs index a57a6a9175ebd..4ccfaf0ee8f90 100644 --- a/src/sqlparser/src/ast/mod.rs +++ b/src/sqlparser/src/ast/mod.rs @@ -13,6 +13,7 @@ //! SQL Abstract Syntax Tree (AST) types mod data_type; pub(crate) mod ddl; +mod legacy_source; mod operator; mod query; mod statement; @@ -36,6 +37,9 @@ pub use self::ddl::{ AlterSchemaOperation, AlterTableOperation, ColumnDef, ColumnOption, ColumnOptionDef, ReferentialAction, SourceWatermark, TableConstraint, }; +pub use self::legacy_source::{ + get_delimiter, AvroSchema, CompatibleSourceSchema, DebeziumAvroSchema, ProtobufSchema, +}; pub use self::operator::{BinaryOperator, QualifiedOperator, UnaryOperator}; pub use self::query::{ Cte, Distinct, Fetch, Join, JoinConstraint, JoinOperator, LateralView, OrderByExpr, Query, diff --git a/src/sqlparser/src/ast/statement.rs b/src/sqlparser/src/ast/statement.rs index 133688875fd6e..f50a6a1c45450 100644 --- a/src/sqlparser/src/ast/statement.rs +++ b/src/sqlparser/src/ast/statement.rs @@ -20,7 +20,8 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; use super::ddl::SourceWatermark; -use super::{EmitMode, Ident, ObjectType, Query, Value}; +use super::legacy_source::{parse_source_schema, CompatibleSourceSchema}; +use super::{EmitMode, Ident, ObjectType, Query}; use crate::ast::{ display_comma_separated, display_separated, ColumnDef, ObjectName, SqlOption, TableConstraint, }; @@ -33,6 +34,7 @@ pub trait ParseTo: Sized { fn parse_to(parser: &mut Parser) -> Result; } +#[macro_export] macro_rules! impl_parse_to { () => {}; ($field:ident : $field_type:ty, $parser:ident) => { @@ -46,6 +48,7 @@ macro_rules! impl_parse_to { }; } +#[macro_export] macro_rules! 
impl_fmt_display { () => {}; ($field:ident, $v:ident, $self:ident) => {{ @@ -84,155 +87,6 @@ pub struct CreateSourceStatement { pub source_watermarks: Vec, } -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub enum SourceSchema { - Protobuf(ProtobufSchema), - // Keyword::PROTOBUF ProtobufSchema - Json, // Keyword::JSON - DebeziumJson, // Keyword::DEBEZIUM_JSON - DebeziumMongoJson, - UpsertJson, // Keyword::UPSERT_JSON - Avro(AvroSchema), // Keyword::AVRO - UpsertAvro(AvroSchema), // Keyword::UpsertAVRO - Maxwell, // Keyword::MAXWELL - CanalJson, // Keyword::CANAL_JSON - Csv(CsvInfo), // Keyword::CSV - Native, - DebeziumAvro(DebeziumAvroSchema), // Keyword::DEBEZIUM_AVRO - Bytes, -} - -impl SourceSchema { - pub fn into_source_schema_v2(self) -> ConnectorSchema { - let (format, row_encode) = match self { - SourceSchema::Protobuf(_) => (Format::Plain, Encode::Protobuf), - SourceSchema::Json => (Format::Plain, Encode::Json), - SourceSchema::DebeziumJson => (Format::Debezium, Encode::Json), - SourceSchema::DebeziumMongoJson => (Format::DebeziumMongo, Encode::Json), - SourceSchema::UpsertJson => (Format::Upsert, Encode::Json), - SourceSchema::Avro(_) => (Format::Plain, Encode::Avro), - SourceSchema::UpsertAvro(_) => (Format::Upsert, Encode::Avro), - SourceSchema::Maxwell => (Format::Maxwell, Encode::Json), - SourceSchema::CanalJson => (Format::Canal, Encode::Json), - SourceSchema::Csv(_) => (Format::Plain, Encode::Csv), - SourceSchema::DebeziumAvro(_) => (Format::Debezium, Encode::Avro), - SourceSchema::Bytes => (Format::Plain, Encode::Bytes), - SourceSchema::Native => (Format::Native, Encode::Native), - }; - - let row_options = match self { - SourceSchema::Protobuf(schema) => { - let mut options = vec![SqlOption { - name: ObjectName(vec![Ident { - value: "message".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(schema.message_name.0), - }]; - if schema.use_schema_registry { - options.push(SqlOption { - name: ObjectName(vec![Ident { - value: "schema.registry".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(schema.row_schema_location.0), - }); - } else { - options.push(SqlOption { - name: ObjectName(vec![Ident { - value: "schema.location".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(schema.row_schema_location.0), - }) - } - options - } - SourceSchema::Avro(schema) | SourceSchema::UpsertAvro(schema) => { - if schema.use_schema_registry { - vec![SqlOption { - name: ObjectName(vec![Ident { - value: "schema.registry".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(schema.row_schema_location.0), - }] - } else { - vec![SqlOption { - name: ObjectName(vec![Ident { - value: "schema.location".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(schema.row_schema_location.0), - }] - } - } - SourceSchema::DebeziumAvro(schema) => { - vec![SqlOption { - name: ObjectName(vec![Ident { - value: "schema.registry".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(schema.row_schema_location.0), - }] - } - SourceSchema::Csv(schema) => { - vec![ - SqlOption { - name: ObjectName(vec![Ident { - value: "delimiter".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString( - String::from_utf8_lossy(&[schema.delimiter]).into(), - ), - }, - SqlOption { - name: ObjectName(vec![Ident { - value: "without_header".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(if schema.has_header { - "false".into() 
- } else { - "true".into() - }), - }, - ] - } - _ => vec![], - }; - - ConnectorSchema { - format, - row_encode, - row_options, - } - } -} - -impl fmt::Display for SourceSchema { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "ROW FORMAT ")?; - match self { - SourceSchema::Protobuf(protobuf_schema) => write!(f, "PROTOBUF {}", protobuf_schema), - SourceSchema::Json => write!(f, "JSON"), - SourceSchema::UpsertJson => write!(f, "UPSERT_JSON"), - SourceSchema::Maxwell => write!(f, "MAXWELL"), - SourceSchema::DebeziumJson => write!(f, "DEBEZIUM_JSON"), - SourceSchema::DebeziumMongoJson => write!(f, "DEBEZIUM_MONGO_JSON"), - SourceSchema::Avro(avro_schema) => write!(f, "AVRO {}", avro_schema), - SourceSchema::UpsertAvro(avro_schema) => write!(f, "UPSERT_AVRO {}", avro_schema), - SourceSchema::CanalJson => write!(f, "CANAL_JSON"), - SourceSchema::Csv(csv_info) => write!(f, "CSV {}", csv_info), - SourceSchema::Native => write!(f, "NATIVE"), - SourceSchema::DebeziumAvro(avro_schema) => write!(f, "DEBEZIUM_AVRO {}", avro_schema), - SourceSchema::Bytes => write!(f, "BYTES"), - } - } -} - #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum Format { @@ -341,96 +195,6 @@ pub struct ConnectorSchema { pub row_options: Vec, } -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub enum CompatibleSourceSchema { - RowFormat(SourceSchema), - V2(ConnectorSchema), -} - -impl fmt::Display for CompatibleSourceSchema { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - CompatibleSourceSchema::RowFormat(inner) => { - write!(f, "{}", inner) - } - CompatibleSourceSchema::V2(inner) => { - write!(f, "{}", inner) - } - } - } -} - -impl CompatibleSourceSchema { - pub(crate) fn into_v2(self) -> ConnectorSchema { - match self { - CompatibleSourceSchema::RowFormat(inner) => inner.into_source_schema_v2(), - CompatibleSourceSchema::V2(inner) => inner, - } - } -} - -impl From for CompatibleSourceSchema { - fn from(value: ConnectorSchema) -> Self { - Self::V2(value) - } -} - -fn parse_source_schema(p: &mut Parser) -> Result { - if let Some(schema_v2) = p.parse_schema()? { - Ok(CompatibleSourceSchema::V2(schema_v2)) - } else if p.peek_nth_any_of_keywords(0, &[Keyword::ROW]) - && p.peek_nth_any_of_keywords(1, &[Keyword::FORMAT]) - { - p.expect_keyword(Keyword::ROW)?; - p.expect_keyword(Keyword::FORMAT)?; - let id = p.parse_identifier()?; - let value = id.value.to_ascii_uppercase(); - let schema = match &value[..] 
{ - "JSON" => SourceSchema::Json, - "UPSERT_JSON" => SourceSchema::UpsertJson, - "PROTOBUF" => { - impl_parse_to!(protobuf_schema: ProtobufSchema, p); - SourceSchema::Protobuf(protobuf_schema) - } - "DEBEZIUM_JSON" => SourceSchema::DebeziumJson, - "DEBEZIUM_MONGO_JSON" => SourceSchema::DebeziumMongoJson, - "AVRO" => { - impl_parse_to!(avro_schema: AvroSchema, p); - SourceSchema::Avro(avro_schema) - } - "UPSERT_AVRO" => { - impl_parse_to!(avro_schema: AvroSchema, p); - SourceSchema::UpsertAvro(avro_schema) - } - "MAXWELL" => SourceSchema::Maxwell, - "CANAL_JSON" => SourceSchema::CanalJson, - "CSV" => { - impl_parse_to!(csv_info: CsvInfo, p); - SourceSchema::Csv(csv_info) - } - "NATIVE" => SourceSchema::Native, // used internally by schema change - "DEBEZIUM_AVRO" => { - impl_parse_to!(avro_schema: DebeziumAvroSchema, p); - SourceSchema::DebeziumAvro(avro_schema) - } - "BYTES" => SourceSchema::Bytes, - _ => { - return Err(ParserError::ParserError( - "expected JSON | UPSERT_JSON | PROTOBUF | DEBEZIUM_JSON | DEBEZIUM_AVRO \ - | AVRO | UPSERT_AVRO | MAXWELL | CANAL_JSON | BYTES | NATIVE after ROW FORMAT" - .to_string(), - )) - } - }; - Ok(CompatibleSourceSchema::RowFormat(schema)) - } else { - Err(ParserError::ParserError( - "expect description of the format".to_string(), - )) - } -} - impl Parser { /// Peek the next tokens to see if it is `FORMAT` or `ROW FORMAT` (for compatibility). fn peek_source_schema_format(&mut self) -> bool { @@ -554,169 +318,6 @@ impl fmt::Display for ConnectorSchema { } } -// sql_grammar!(ProtobufSchema { -// [Keyword::MESSAGE], -// message_name: AstString, -// [Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], -// row_schema_location: AstString, -// }); -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct ProtobufSchema { - pub message_name: AstString, - pub row_schema_location: AstString, - pub use_schema_registry: bool, -} - -impl ParseTo for ProtobufSchema { - fn parse_to(p: &mut Parser) -> Result { - impl_parse_to!([Keyword::MESSAGE], p); - impl_parse_to!(message_name: AstString, p); - impl_parse_to!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], p); - impl_parse_to!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], p); - impl_parse_to!(row_schema_location: AstString, p); - Ok(Self { - message_name, - row_schema_location, - use_schema_registry, - }) - } -} - -impl fmt::Display for ProtobufSchema { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut v: Vec = vec![]; - impl_fmt_display!([Keyword::MESSAGE], v); - impl_fmt_display!(message_name, v, self); - impl_fmt_display!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], v); - impl_fmt_display!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], v, self); - impl_fmt_display!(row_schema_location, v, self); - v.iter().join(" ").fmt(f) - } -} - -// sql_grammar!(AvroSchema { -// [Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION, [Keyword::CONFLUENT, Keyword::SCHEMA, -// Keyword::REGISTRY]], row_schema_location: AstString, -// }); -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct AvroSchema { - pub row_schema_location: AstString, - pub use_schema_registry: bool, -} -impl ParseTo for AvroSchema { - fn parse_to(p: &mut Parser) -> Result { - impl_parse_to!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], p); - impl_parse_to!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, 
Keyword::REGISTRY], p); - impl_parse_to!(row_schema_location: AstString, p); - Ok(Self { - row_schema_location, - use_schema_registry, - }) - } -} - -impl fmt::Display for AvroSchema { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut v: Vec = vec![]; - impl_fmt_display!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], v); - impl_fmt_display!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], v, self); - impl_fmt_display!(row_schema_location, v, self); - v.iter().join(" ").fmt(f) - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct DebeziumAvroSchema { - pub row_schema_location: AstString, -} - -impl fmt::Display for DebeziumAvroSchema { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut v: Vec = vec![]; - impl_fmt_display!( - [ - Keyword::ROW, - Keyword::SCHEMA, - Keyword::LOCATION, - Keyword::CONFLUENT, - Keyword::SCHEMA, - Keyword::REGISTRY - ], - v - ); - impl_fmt_display!(row_schema_location, v, self); - v.iter().join(" ").fmt(f) - } -} - -impl ParseTo for DebeziumAvroSchema { - fn parse_to(p: &mut Parser) -> Result { - impl_parse_to!( - [ - Keyword::ROW, - Keyword::SCHEMA, - Keyword::LOCATION, - Keyword::CONFLUENT, - Keyword::SCHEMA, - Keyword::REGISTRY - ], - p - ); - impl_parse_to!(row_schema_location: AstString, p); - Ok(Self { - row_schema_location, - }) - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct CsvInfo { - pub delimiter: u8, - pub has_header: bool, -} - -pub fn get_delimiter(chars: &str) -> Result { - match chars { - "," => Ok(b','), // comma - "\t" => Ok(b'\t'), // tab - other => Err(ParserError::ParserError(format!( - "The delimiter should be one of ',', E'\\t', but got {:?}", - other - ))), - } -} - -impl ParseTo for CsvInfo { - fn parse_to(p: &mut Parser) -> Result { - impl_parse_to!(without_header => [Keyword::WITHOUT, Keyword::HEADER], p); - impl_parse_to!([Keyword::DELIMITED, Keyword::BY], p); - impl_parse_to!(delimiter: AstString, p); - let delimiter = get_delimiter(delimiter.0.as_str())?; - Ok(Self { - delimiter, - has_header: !without_header, - }) - } -} - -impl fmt::Display for CsvInfo { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut v: Vec = vec![]; - if !self.has_header { - v.push(format!( - "{}", - AstVec([Keyword::WITHOUT, Keyword::HEADER].to_vec()) - )); - } - impl_fmt_display!(delimiter, v, self); - v.iter().join(" ").fmt(f) - } -} - impl ParseTo for CreateSourceStatement { fn parse_to(p: &mut Parser) -> Result { impl_parse_to!(if_not_exists => [Keyword::IF, Keyword::NOT, Keyword::EXISTS], p); diff --git a/src/sqlparser/src/parser.rs b/src/sqlparser/src/parser.rs index fcd3a3dff3e36..4eb80286cf8e0 100644 --- a/src/sqlparser/src/parser.rs +++ b/src/sqlparser/src/parser.rs @@ -2471,17 +2471,12 @@ impl Parser { let cdc_table_info = if self.parse_keyword(Keyword::FROM) { let source_name = self.parse_object_name()?; - if self.parse_keyword(Keyword::TABLE) { - let external_table_name = self.parse_literal_string()?; - Some(CdcTableInfo { - source_name, - external_table_name, - }) - } else { - return Err(ParserError::ParserError( - "Expect a TABLE clause on table created by CREATE TABLE FROM".to_string(), - )); - } + self.expect_keyword(Keyword::TABLE)?; + let external_table_name = self.parse_literal_string()?; + Some(CdcTableInfo { + source_name, + external_table_name, + }) } else { None }; diff --git 
a/src/sqlparser/tests/testdata/create.yaml b/src/sqlparser/tests/testdata/create.yaml index dd189960e213a..4da81a4c43325 100644 --- a/src/sqlparser/tests/testdata/create.yaml +++ b/src/sqlparser/tests/testdata/create.yaml @@ -16,13 +16,23 @@ - input: CREATE TABLE t (a INT, b INT) AS SELECT 1 AS b, 2 AS a formatted_sql: CREATE TABLE t (a INT, b INT) AS SELECT 1 AS b, 2 AS a - input: CREATE SOURCE src - error_msg: 'sql parser error: expect description of the format' + error_msg: |- + sql parser error: Expected description of the format, found: EOF at the end + Near "CREATE SOURCE src" +- input: CREATE SOURCE src-a FORMAT PLAIN ENCODE JSON + error_msg: |- + sql parser error: Expected description of the format, found: - at line:1, column:19 + Near "CREATE SOURCE src" - input: CREATE SOURCE src FORMAT PLAIN ENCODE JSON formatted_sql: CREATE SOURCE src FORMAT PLAIN ENCODE JSON - input: CREATE SOURCE mysql_src with ( connector = 'mysql-cdc', hostname = 'localhost', port = '3306', database.name = 'mytest', server.id = '5601' ) formatted_sql: CREATE SOURCE mysql_src WITH (connector = 'mysql-cdc', hostname = 'localhost', port = '3306', database.name = 'mytest', server.id = '5601') FORMAT PLAIN ENCODE JSON - input: CREATE TABLE sbtest10 (id INT PRIMARY KEY, k INT, c CHARACTER VARYING, pad CHARACTER VARYING) FROM sbtest TABLE 'mydb.sbtest10' formatted_sql: CREATE TABLE sbtest10 (id INT PRIMARY KEY, k INT, c CHARACTER VARYING, pad CHARACTER VARYING) FROM sbtest TABLE 'mydb.sbtest10' +- input: CREATE TABLE sbtest10 (id INT PRIMARY KEY, k INT, c CHARACTER VARYING, pad CHARACTER VARYING) FROM sbtest + error_msg: |- + sql parser error: Expected TABLE, found: EOF at the end + Near "pad CHARACTER VARYING) FROM sbtest" - input: CREATE SOURCE IF NOT EXISTS src WITH (kafka.topic = 'abc', kafka.servers = 'localhost:1001') FORMAT PLAIN ENCODE PROTOBUF (message = 'Foo', schema.location = 'file://') formatted_sql: CREATE SOURCE IF NOT EXISTS src WITH (kafka.topic = 'abc', kafka.servers = 'localhost:1001') FORMAT PLAIN ENCODE PROTOBUF (message = 'Foo', schema.location = 'file://') formatted_ast: 'CreateSource { stmt: CreateSourceStatement { if_not_exists: true, columns: [], constraints: [], source_name: ObjectName([Ident { value: "src", quote_style: None }]), with_properties: WithProperties([SqlOption { name: ObjectName([Ident { value: "kafka", quote_style: None }, Ident { value: "topic", quote_style: None }]), value: SingleQuotedString("abc") }, SqlOption { name: ObjectName([Ident { value: "kafka", quote_style: None }, Ident { value: "servers", quote_style: None }]), value: SingleQuotedString("localhost:1001") }]), source_schema: V2(ConnectorSchema { format: Plain, row_encode: Protobuf, row_options: [SqlOption { name: ObjectName([Ident { value: "message", quote_style: None }]), value: SingleQuotedString("Foo") }, SqlOption { name: ObjectName([Ident { value: "schema", quote_style: None }, Ident { value: "location", quote_style: None }]), value: SingleQuotedString("file://") }] }), source_watermarks: [] } }' diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml index 0c1045fac230c..07bb7fd528890 100644 --- a/src/storage/Cargo.toml +++ b/src/storage/Cargo.toml @@ -89,6 +89,7 @@ workspace-hack = { path = "../workspace-hack" } [dev-dependencies] criterion = { workspace = true, features = ["async_futures"] } +expect-test = "1" moka = { version = "0.12", features = ["future"] } risingwave_hummock_sdk = { workspace = true, features = ["enable_test_epoch"] } risingwave_test_runner = { workspace = true } diff --git 
a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs index a86a31769f28f..1a85bea02b504 100644 --- a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs +++ b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs @@ -24,7 +24,7 @@ use risingwave_pb::hummock::hummock_version_delta::GroupDeltas; use risingwave_pb::hummock::{ CompactionConfig, CompatibilityVersion, GroupConstruct, GroupDestroy, GroupMetaChange, GroupTableChange, HummockVersion, HummockVersionDelta, Level, LevelType, OverlappingLevel, - PbLevelType, SstableInfo, + PbLevelType, PbTableWatermarks, SstableInfo, }; use tracing::warn; @@ -189,6 +189,47 @@ impl HummockVersion { .map(|group| group.levels.len() + 1) .unwrap_or(0) } + + pub fn safe_epoch_table_watermarks( + &self, + existing_table_ids: &[u32], + ) -> BTreeMap { + fn extract_single_table_watermark( + table_watermarks: &PbTableWatermarks, + safe_epoch: u64, + ) -> Option { + if let Some(first_epoch_watermark) = table_watermarks.epoch_watermarks.first() { + assert!( + first_epoch_watermark.epoch >= safe_epoch, + "smallest epoch {} in table watermark should be at least safe epoch {}", + first_epoch_watermark.epoch, + safe_epoch + ); + if first_epoch_watermark.epoch == safe_epoch { + Some(PbTableWatermarks { + epoch_watermarks: vec![first_epoch_watermark.clone()], + is_ascending: table_watermarks.is_ascending, + }) + } else { + None + } + } else { + None + } + } + self.table_watermarks + .iter() + .filter_map(|(table_id, table_watermarks)| { + let u32_table_id = *table_id as _; + if !existing_table_ids.contains(&u32_table_id) { + None + } else { + extract_single_table_watermark(table_watermarks, self.safe_epoch) + .map(|table_watermarks| (*table_id, table_watermarks)) + } + }) + .collect() + } } pub type SstSplitInfo = ( diff --git a/src/storage/hummock_sdk/src/key.rs b/src/storage/hummock_sdk/src/key.rs index ba2b55a5e7849..a2fb4ef99a0cc 100644 --- a/src/storage/hummock_sdk/src/key.rs +++ b/src/storage/hummock_sdk/src/key.rs @@ -510,7 +510,9 @@ impl> UserKey { } /// Encode in to a buffer. 
- pub fn encode_length_prefixed(&self, buf: &mut impl BufMut) { + /// + /// length prefixed requires 4B more than its `encoded_len()` + pub fn encode_length_prefixed(&self, mut buf: impl BufMut) { buf.put_u32(self.table_id.table_id()); buf.put_u32(self.table_key.as_ref().len() as u32); buf.put_slice(self.table_key.as_ref()); diff --git a/src/storage/hummock_sdk/src/table_watermark.rs b/src/storage/hummock_sdk/src/table_watermark.rs index db38eedd6a06d..cd427832f4aaf 100644 --- a/src/storage/hummock_sdk/src/table_watermark.rs +++ b/src/storage/hummock_sdk/src/table_watermark.rs @@ -156,9 +156,9 @@ impl TableWatermarks { } pub fn merge_multiple_new_table_watermarks( - table_watermarks_list: impl IntoIterator>, -) -> HashMap { - let mut ret: HashMap)> = HashMap::new(); + table_watermarks_list: impl IntoIterator>, +) -> HashMap { + let mut ret: HashMap)> = HashMap::new(); for table_watermarks in table_watermarks_list { for (table_id, new_table_watermarks) in table_watermarks { let epoch_watermarks = match ret.entry(table_id) { diff --git a/src/storage/hummock_test/src/test_utils.rs b/src/storage/hummock_test/src/test_utils.rs index 3b7d6701ed886..e06f798fc76a9 100644 --- a/src/storage/hummock_test/src/test_utils.rs +++ b/src/storage/hummock_test/src/test_utils.rs @@ -262,9 +262,7 @@ impl HummockTestEnv { res.uncommitted_ssts, res.table_watermarks .into_iter() - .map(|(table_id, watermark)| { - (table_id.table_id as u64, watermark.to_protobuf()) - }) + .map(|(table_id, watermark)| (table_id.table_id, watermark.to_protobuf())) .collect(), ) .await diff --git a/src/storage/src/hummock/compactor/compactor_runner.rs b/src/storage/src/hummock/compactor/compactor_runner.rs index a137b1f101a6a..47443a3a6fee9 100644 --- a/src/storage/src/hummock/compactor/compactor_runner.rs +++ b/src/storage/src/hummock/compactor/compactor_runner.rs @@ -43,7 +43,8 @@ use crate::hummock::compactor::{ fast_compactor_runner, CompactOutput, CompactionFilter, Compactor, CompactorContext, }; use crate::hummock::iterator::{ - Forward, ForwardMergeRangeIterator, HummockIterator, UnorderedMergeIteratorInner, + Forward, ForwardMergeRangeIterator, HummockIterator, SkipWatermarkIterator, + UnorderedMergeIteratorInner, }; use crate::hummock::multi_builder::{CapacitySplitTableBuilder, TableBuilderFactory}; use crate::hummock::value::HummockValue; @@ -224,8 +225,14 @@ impl CompactorRunner { } } } + + // The `SkipWatermarkIterator` is used to handle the table watermark state cleaning introduced + // in https://github.com/risingwavelabs/risingwave/issues/13148 Ok(( - UnorderedMergeIteratorInner::for_compactor(table_iters), + SkipWatermarkIterator::from_safe_epoch_watermarks( + UnorderedMergeIteratorInner::for_compactor(table_iters), + &self.compact_task.table_watermarks, + ), CompactionDeleteRangeIterator::new(del_iter), )) } diff --git a/src/storage/src/hummock/iterator/skip_watermark.rs b/src/storage/src/hummock/iterator/skip_watermark.rs index 58180ff356fc7..09644b2ab7475 100644 --- a/src/storage/src/hummock/iterator/skip_watermark.rs +++ b/src/storage/src/hummock/iterator/skip_watermark.rs @@ -16,10 +16,12 @@ use std::cmp::Ordering; use std::collections::{BTreeMap, VecDeque}; use bytes::Bytes; +use risingwave_common::buffer::Bitmap; use risingwave_common::catalog::TableId; -use risingwave_common::hash::VirtualNode; +use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; use risingwave_hummock_sdk::key::FullKey; use risingwave_hummock_sdk::table_watermark::{ReadTableWatermark, WatermarkDirection}; +use
risingwave_pb::hummock::PbTableWatermarks; use crate::hummock::iterator::{Forward, HummockIterator}; use crate::hummock::value::HummockValue; @@ -41,6 +43,51 @@ impl> SkipWatermarkIterator { } } + pub fn from_safe_epoch_watermarks( + inner: I, + safe_epoch_watermarks: &BTreeMap, + ) -> Self { + let watermarks = safe_epoch_watermarks + .iter() + .map(|(table_id, watermarks)| { + assert_eq!(watermarks.epoch_watermarks.len(), 1); + let vnode_watermarks = &watermarks + .epoch_watermarks + .first() + .expect("should exist") + .watermarks; + let mut vnode_watermark_map = BTreeMap::new(); + for vnode_watermark in vnode_watermarks { + let watermark = Bytes::copy_from_slice(&vnode_watermark.watermark); + for vnode in + Bitmap::from(vnode_watermark.vnode_bitmap.as_ref().expect("should exist")) + .iter_vnodes() + { + assert!( + vnode_watermark_map + .insert(vnode, watermark.clone()) + .is_none(), + "duplicate table watermark on vnode {}", + vnode.to_index() + ); + } + } + ( + TableId::from(*table_id), + ReadTableWatermark { + direction: if watermarks.is_ascending { + WatermarkDirection::Ascending + } else { + WatermarkDirection::Descending + }, + vnode_watermarks: vnode_watermark_map, + }, + ) + }) + .collect(); + Self::new(inner, watermarks) + } + fn reset_watermark(&mut self) { self.remain_watermarks = self .watermarks diff --git a/src/storage/src/hummock/sstable/mod.rs b/src/storage/src/hummock/sstable/mod.rs index 65c38c68c3bc2..039e7962f2d7c 100644 --- a/src/storage/src/hummock/sstable/mod.rs +++ b/src/storage/src/hummock/sstable/mod.rs @@ -65,7 +65,6 @@ use super::{HummockError, HummockResult}; use crate::hummock::CachePolicy; use crate::store::ReadOptions; -const DEFAULT_META_BUFFER_CAPACITY: usize = 4096; const MAGIC: u32 = 0x5785ab73; const OLD_VERSION: u32 = 1; const VERSION: u32 = 2; @@ -164,8 +163,10 @@ impl MonotonicDeleteEvent { } } - pub fn encode(&self, buf: &mut Vec) { - self.event_key.left_user_key.encode_length_prefixed(buf); + pub fn encode(&self, mut buf: impl BufMut) { + self.event_key + .left_user_key + .encode_length_prefixed(&mut buf); buf.put_u8(if self.event_key.is_exclude_left_key { 1 } else { @@ -191,6 +192,7 @@ impl MonotonicDeleteEvent { #[inline] pub fn encoded_size(&self) -> usize { + // length prefixed requires 4B more than its `encoded_len()` 4 + self.event_key.left_user_key.encoded_len() + 1 + 8 } } @@ -292,7 +294,7 @@ impl BlockMeta { /// ```plain /// | offset (4B) | len (4B) | uncompressed size (4B) | smallest key len (4B) | smallest key | /// ``` - pub fn encode(&self, buf: &mut Vec) { + pub fn encode(&self, mut buf: impl BufMut) { buf.put_u32_le(self.offset); buf.put_u32_le(self.len); buf.put_u32_le(self.uncompressed_size); @@ -389,13 +391,15 @@ impl SstableMeta { /// | checksum (8B) | version (4B) | magic (4B) | /// ``` pub fn encode_to_bytes(&self) -> Vec { - let mut buf = Vec::with_capacity(DEFAULT_META_BUFFER_CAPACITY); + let encoded_size = self.encoded_size(); + let mut buf = Vec::with_capacity(encoded_size); self.encode_to(&mut buf); buf } - pub fn encode_to(&self, buf: &mut Vec) { - let start_offset = buf.len(); + pub fn encode_to(&self, mut buf: impl BufMut + AsRef<[u8]>) { + let start = buf.as_ref().len(); + buf.put_u32_le( utils::checked_into_u32(self.block_metas.len()).unwrap_or_else(|_| { let tmp_full_key = FullKey::decode(&self.smallest_key); @@ -407,13 +411,13 @@ impl SstableMeta { }), ); for block_meta in &self.block_metas { - block_meta.encode(buf); + block_meta.encode(&mut buf); } - put_length_prefixed_slice(buf, &self.bloom_filter); + 
put_length_prefixed_slice(&mut buf, &self.bloom_filter); buf.put_u32_le(self.estimated_size); buf.put_u32_le(self.key_count); - put_length_prefixed_slice(buf, &self.smallest_key); - put_length_prefixed_slice(buf, &self.largest_key); + put_length_prefixed_slice(&mut buf, &self.smallest_key); + put_length_prefixed_slice(&mut buf, &self.largest_key); buf.put_u32_le( utils::checked_into_u32(self.monotonic_tombstone_events.len()).unwrap_or_else(|_| { let tmp_full_key = FullKey::decode(&self.smallest_key); @@ -425,10 +429,13 @@ impl SstableMeta { }), ); for monotonic_tombstone_event in &self.monotonic_tombstone_events { - monotonic_tombstone_event.encode(buf); + monotonic_tombstone_event.encode(&mut buf); } buf.put_u64_le(self.meta_offset); - let checksum = xxhash64_checksum(&buf[start_offset..]); + + let end = buf.as_ref().len(); + + let checksum = xxhash64_checksum(&buf.as_ref()[start..end]); buf.put_u64_le(checksum); buf.put_u32_le(VERSION); buf.put_u32_le(MAGIC); diff --git a/src/storage/src/hummock/sstable/utils.rs b/src/storage/src/hummock/sstable/utils.rs index b754b17f4a3dc..920dd2c75b611 100644 --- a/src/storage/src/hummock/sstable/utils.rs +++ b/src/storage/src/hummock/sstable/utils.rs @@ -71,7 +71,7 @@ pub fn xxhash64_verify(data: &[u8], checksum: u64) -> HummockResult<()> { use bytes::{Buf, BufMut}; -pub fn put_length_prefixed_slice(buf: &mut Vec, slice: &[u8]) { +pub fn put_length_prefixed_slice(mut buf: impl BufMut, slice: &[u8]) { let len = checked_into_u32(slice.len()) .unwrap_or_else(|_| panic!("WARN overflow can't convert slice {} into u32", slice.len())); buf.put_u32_le(len); diff --git a/src/storage/src/row_serde/mod.rs b/src/storage/src/row_serde/mod.rs index cac6fa320ea80..5fc99b8b6945a 100644 --- a/src/storage/src/row_serde/mod.rs +++ b/src/storage/src/row_serde/mod.rs @@ -19,7 +19,7 @@ pub mod row_serde_util; pub mod value_serde; -/// Find out the [`ColumnDesc`] by a list of [`ColumnId`]. +/// Find out the [`ColumnDesc`] selected with a list of [`ColumnId`]. 
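`put_length_prefixed_slice` and the other encoders above now take `mut buf: impl BufMut` instead of `&mut Vec<u8>`; since `bytes` implements `BufMut` for `&mut T` where `T: BufMut`, existing call sites keep passing `&mut buf` and nested encoders simply forward it. A hedged sketch with an illustrative function name:

use bytes::{BufMut, BytesMut};

// Hypothetical free function mirroring the new `mut buf: impl BufMut` signature style.
fn put_length_prefixed(mut buf: impl BufMut, slice: &[u8]) {
    buf.put_u32_le(slice.len() as u32);
    buf.put_slice(slice);
}

fn main() {
    let mut v: Vec<u8> = Vec::new();
    put_length_prefixed(&mut v, b"abc"); // `&mut Vec<u8>` implements `BufMut`
    let mut b = BytesMut::new();
    put_length_prefixed(&mut b, b"abc"); // ...and so does `&mut BytesMut`
    assert_eq!(v.len(), 4 + 3); // 4-byte length prefix + payload
}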
/// /// # Returns /// @@ -57,3 +57,97 @@ impl ColumnMapping { origin_row.project(&self.output_indices) } } + +#[cfg(test)] +mod test { + use std::fmt::Debug; + + use expect_test::{expect, Expect}; + use risingwave_common::types::DataType; + + use super::*; + + fn check(actual: impl Debug, expect: Expect) { + let actual = format!("{:#?}", actual); + expect.assert_eq(&actual); + } + + #[test] + fn test_find_columns_by_ids() { + let table_columns = vec![ + ColumnDesc::unnamed(1.into(), DataType::Varchar), + ColumnDesc::unnamed(2.into(), DataType::Int64), + ColumnDesc::unnamed(3.into(), DataType::Int16), + ]; + let column_ids = vec![2.into(), 3.into()]; + let result = find_columns_by_ids(&table_columns, &column_ids); + check( + result, + expect![[r#" + ( + [ + ColumnDesc { + data_type: Int64, + column_id: #2, + name: "", + field_descs: [], + type_name: "", + generated_or_default_column: None, + description: None, + }, + ColumnDesc { + data_type: Int16, + column_id: #3, + name: "", + field_descs: [], + type_name: "", + generated_or_default_column: None, + description: None, + }, + ], + [ + 1, + 2, + ], + )"#]], + ); + + let table_columns = vec![ + ColumnDesc::unnamed(2.into(), DataType::Int64), + ColumnDesc::unnamed(1.into(), DataType::Varchar), + ColumnDesc::unnamed(3.into(), DataType::Int16), + ]; + let column_ids = vec![2.into(), 1.into()]; + let result = find_columns_by_ids(&table_columns, &column_ids); + check( + result, + expect![[r#" + ( + [ + ColumnDesc { + data_type: Int64, + column_id: #2, + name: "", + field_descs: [], + type_name: "", + generated_or_default_column: None, + description: None, + }, + ColumnDesc { + data_type: Varchar, + column_id: #1, + name: "", + field_descs: [], + type_name: "", + generated_or_default_column: None, + description: None, + }, + ], + [ + 0, + 1, + ], + )"#]], + ); + } +} diff --git a/src/storage/src/table/batch_table/storage_table.rs b/src/storage/src/table/batch_table/storage_table.rs index 6edbf68ec7427..24d64076aa9ac 100644 --- a/src/storage/src/table/batch_table/storage_table.rs +++ b/src/storage/src/table/batch_table/storage_table.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::default::Default; use std::ops::Bound::{self, Excluded, Included, Unbounded}; use std::ops::{Index, RangeBounds}; use std::sync::Arc; @@ -117,14 +118,21 @@ impl std::fmt::Debug for StorageTableInner StorageTableInner { /// Create a [`StorageTableInner`] given a complete set of `columns` and a partial - /// set of `column_ids`. The output will only contains columns with the given ids in the same - /// order. + /// set of `output_column_ids`. + /// When reading from the storage table, + /// the chunks or rows will only contain columns with the given ids (`output_column_ids`). + /// They will in the same order as the given `output_column_ids`. + /// + /// NOTE(kwannoel): The `output_column_ids` here may be slightly different + /// from those supplied to associated executors. + /// These `output_column_ids` may have `pk` appended, since they will be needed to scan from + /// storage. The associated executors may not have these `pk` fields. 
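A positions-only restatement of `find_columns_by_ids`, as exercised by the `test_find_columns_by_ids` cases above and used by `new_partial` to resolve `output_column_ids` into output indices (simplified to bare ids; the real helper also returns the matching `ColumnDesc`s):

fn find_positions_by_ids(table_column_ids: &[i32], requested: &[i32]) -> Vec<usize> {
    requested
        .iter()
        .map(|id| {
            table_column_ids
                .iter()
                .position(|c| c == id)
                .expect("requested column id must exist in the table")
        })
        .collect()
}

fn main() {
    // mirrors the second expect-test case: table columns [#2, #1, #3], requested [#2, #1]
    assert_eq!(find_positions_by_ids(&[2, 1, 3], &[2, 1]), vec![0, 1]);
    // and the first: table columns [#1, #2, #3], requested [#2, #3]
    assert_eq!(find_positions_by_ids(&[1, 2, 3], &[2, 3]), vec![1, 2]);
}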
#[allow(clippy::too_many_arguments)] pub fn new_partial( store: S, table_id: TableId, table_columns: Vec, - column_ids: Vec, + output_column_ids: Vec, order_types: Vec, pk_indices: Vec, distribution: Distribution, @@ -137,7 +145,7 @@ impl StorageTableInner { store, table_id, table_columns, - column_ids, + output_column_ids, order_types, pk_indices, distribution, @@ -156,12 +164,12 @@ impl StorageTableInner { pk_indices: Vec, value_indices: Vec, ) -> Self { - let column_ids = columns.iter().map(|c| c.column_id).collect(); + let output_column_ids = columns.iter().map(|c| c.column_id).collect(); Self::new_inner( store, table_id, columns, - column_ids, + output_column_ids, order_types, pk_indices, Distribution::fallback(), @@ -177,7 +185,7 @@ impl StorageTableInner { store: S, table_id: TableId, table_columns: Vec, - column_ids: Vec, + output_column_ids: Vec, order_types: Vec, pk_indices: Vec, Distribution { @@ -191,7 +199,8 @@ impl StorageTableInner { ) -> Self { assert_eq!(order_types.len(), pk_indices.len()); - let (output_columns, output_indices) = find_columns_by_ids(&table_columns, &column_ids); + let (output_columns, output_indices) = + find_columns_by_ids(&table_columns, &output_column_ids); let mut value_output_indices = vec![]; let mut key_output_indices = vec![]; diff --git a/src/storage/src/table/mod.rs b/src/storage/src/table/mod.rs index b6407528d5272..e22b154ccfc93 100644 --- a/src/storage/src/table/mod.rs +++ b/src/storage/src/table/mod.rs @@ -128,12 +128,13 @@ where } /// Collects data chunks from stream of rows. -pub async fn collect_data_chunk_with_builder( +pub async fn collect_data_chunk_with_builder( stream: &mut S, builder: &mut DataChunkBuilder, ) -> Result, E> where - S: Stream> + Unpin, + R: Row, + S: Stream> + Unpin, { // TODO(kwannoel): If necessary, we can optimize it in the future. // This can be done by moving the check if builder is full from `append_one_row` to here, @@ -206,6 +207,7 @@ fn check_vnode_is_set(vnode: VirtualNode, vnodes: &Bitmap) { ); } +#[derive(Debug)] pub struct KeyedRow> { vnode_prefixed_key: TableKey, row: OwnedRow, @@ -230,6 +232,10 @@ impl> KeyedRow { pub fn key(&self) -> &[u8] { self.vnode_prefixed_key.key_part() } + + pub fn into_parts(self) -> (TableKey, OwnedRow) { + (self.vnode_prefixed_key, self.row) + } } impl> Deref for KeyedRow { diff --git a/src/stream/src/common/builder.rs b/src/stream/src/common/builder.rs index 947a79f3747c9..6180b2cd69163 100644 --- a/src/stream/src/common/builder.rs +++ b/src/stream/src/common/builder.rs @@ -12,136 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -use risingwave_common::array::stream_record::Record; -use risingwave_common::array::{ArrayBuilderImpl, Op, StreamChunk}; +// Re-export `StreamChunkBuilder`. +pub use risingwave_common::array::stream_chunk_builder::StreamChunkBuilder; +use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::row::Row; use risingwave_common::types::{DataType, DatumRef}; -use risingwave_common::util::iter_util::ZipEqFast; - -/// Build stream chunks with fixed chunk size from rows or records. 
-pub struct StreamChunkBuilder { - /// operations in the data chunk to build - ops: Vec, - - /// arrays in the data chunk to build - column_builders: Vec, - - /// Data types of columns - data_types: Vec, - - /// Maximum capacity of column builder - capacity: usize, - - /// Size of column builder - size: usize, -} - -impl Drop for StreamChunkBuilder { - fn drop(&mut self) { - // Possible to fail when async task gets cancelled. - if self.size != 0 { - tracing::warn!( - remaining = self.size, - "dropping non-empty stream chunk builder" - ); - } - } -} - -impl StreamChunkBuilder { - pub fn new(chunk_size: usize, data_types: Vec) -> Self { - assert!(chunk_size > 0); - - let ops = Vec::with_capacity(chunk_size); - let column_builders = data_types - .iter() - .map(|datatype| datatype.create_array_builder(chunk_size)) - .collect(); - Self { - ops, - column_builders, - data_types, - capacity: chunk_size, - size: 0, - } - } - - /// Increase chunk size - /// - /// A [`StreamChunk`] will be returned when `size == capacity` - #[must_use] - fn inc_size(&mut self) -> Option { - self.size += 1; - - // Take a chunk when capacity is exceeded. Splitting `UpdateDelete` and `UpdateInsert` - // should be avoided, so when the last one is `UpdateDelete`, we delay the chunk until - // `UpdateInsert` comes. This means the output chunk size may exceed the given `chunk_size`, - // and theoretically at most `chunk_size + 1` if inputs are consistent. - if self.size >= self.capacity && self.ops[self.ops.len() - 1] != Op::UpdateDelete { - self.take() - } else { - None - } - } - - /// Append an iterator of output index and datum to the builder, return a chunk if the builder - /// is full. - /// Note: the caller must ensure that each column occurs exactly once in `iter`. - fn append_iter<'a>( - &mut self, - op: Op, - iter: impl IntoIterator)>, - ) -> Option { - self.ops.push(op); - for (i, datum) in iter { - self.column_builders[i].append(datum); - } - self.inc_size() - } - - /// Append a row to the builder, return a chunk if the builder is full. - #[must_use] - pub fn append_row(&mut self, op: Op, row: impl Row) -> Option { - self.append_iter(op, row.iter().enumerate()) - } - - /// Append a record to the builder, return a chunk if the builder is full. 
- #[must_use] - pub fn append_record(&mut self, record: Record) -> Option { - match record { - Record::Insert { new_row } => self.append_row(Op::Insert, new_row), - Record::Delete { old_row } => self.append_row(Op::Delete, old_row), - Record::Update { old_row, new_row } => { - let none = self.append_row(Op::UpdateDelete, old_row); - debug_assert!(none.is_none()); - self.append_row(Op::UpdateInsert, new_row) - } - } - } - - #[must_use] - pub fn take(&mut self) -> Option { - if self.size == 0 { - return None; - } - - self.size = 0; - let new_columns = self - .column_builders - .iter_mut() - .zip_eq_fast(&self.data_types) - .map(|(builder, datatype)| { - std::mem::replace(builder, datatype.create_array_builder(self.capacity)).finish() - }) - .map(Into::into) - .collect::>(); - - Some(StreamChunk::new( - std::mem::replace(&mut self.ops, Vec::with_capacity(self.capacity)), - new_columns, - )) - } -} type IndexMappings = Vec<(usize, usize)>; diff --git a/src/stream/src/common/table/state_table.rs b/src/stream/src/common/table/state_table.rs index bfdd50f883fd7..b2b172cc79858 100644 --- a/src/stream/src/common/table/state_table.rs +++ b/src/stream/src/common/table/state_table.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::default::Default; use std::ops::Bound; use std::ops::Bound::*; use std::sync::Arc; @@ -25,10 +26,12 @@ use risingwave_common::array::stream_record::Record; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::buffer::Bitmap; use risingwave_common::cache::CachePriority; -use risingwave_common::catalog::{get_dist_key_in_pk_indices, ColumnDesc, TableId, TableOption}; +use risingwave_common::catalog::{ + get_dist_key_in_pk_indices, ColumnDesc, ColumnId, TableId, TableOption, +}; use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; use risingwave_common::row::{self, once, CompactedRow, Once, OwnedRow, Row, RowExt}; -use risingwave_common::types::{Datum, DefaultOrd, DefaultOrdered, ScalarImpl}; +use risingwave_common::types::{DataType, Datum, DefaultOrd, DefaultOrdered, ScalarImpl}; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::iter_util::{ZipEqDebug, ZipEqFast}; use risingwave_common::util::row_serde::OrderedRowSerde; @@ -42,6 +45,7 @@ use risingwave_pb::catalog::Table; use risingwave_storage::error::{ErrorKind, StorageError, StorageResult}; use risingwave_storage::hummock::CachePolicy; use risingwave_storage::mem_table::MemTableError; +use risingwave_storage::row_serde::find_columns_by_ids; use risingwave_storage::row_serde::row_serde_util::{ deserialize_pk_with_vnode, serialize_pk, serialize_pk_with_vnode, }; @@ -138,13 +142,23 @@ pub struct StateTableInner< /// Watermark cache watermark_cache: StateTableWatermarkCache, + + /// Data Types + /// We will need to use to build data chunks from state table rows. + data_types: Vec, + + /// Output indices + /// Used for: + /// 1. Computing output_value_indices to ser/de replicated rows. + /// 2. Computing output pk indices to used them for backfill state. + output_indices: Vec, } /// `StateTable` will use `BasicSerde` as default pub type StateTable = StateTableInner; /// `ReplicatedStateTable` is meant to replicate upstream shared buffer. /// Used for `ArrangementBackfill` executor. -pub type ReplicatedStateTable = StateTableInner; +pub type ReplicatedStateTable = StateTableInner; /// `WatermarkCacheStateTable` caches the watermark column. /// It will reduce state cleaning overhead. 
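`ReplicatedStateTable` above is `StateTableInner` with its `IS_REPLICATED` const parameter set to `true`, and the new `data_types` / `output_indices` fields feed the replicated-only code paths later in this hunk. A minimal sketch of the const-generic-flag pattern, with illustrative names only:

struct TableInner<const IS_REPLICATED: bool> {
    output_indices: Vec<usize>,
}

type PlainTable = TableInner<false>;
type ReplicatedTable = TableInner<true>;

impl<const IS_REPLICATED: bool> TableInner<IS_REPLICATED> {
    fn project_row(&self, row: Vec<i32>) -> Vec<i32> {
        if IS_REPLICATED {
            // only the replicated variant reorders rows to the requested output columns
            self.output_indices.iter().map(|&i| row[i]).collect()
        } else {
            row
        }
    }
}

fn main() {
    let replicated: ReplicatedTable = TableInner { output_indices: vec![2, 0] };
    assert_eq!(replicated.project_row(vec![10, 20, 30]), vec![30, 10]);
    let plain: PlainTable = TableInner { output_indices: vec![] };
    assert_eq!(plain.project_row(vec![10, 20, 30]), vec![10, 20, 30]);
}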
pub type WatermarkCacheStateTable = @@ -202,7 +216,7 @@ where store: S, vnodes: Option>, ) -> Self { - Self::from_table_catalog_inner(table_catalog, store, vnodes, true).await + Self::from_table_catalog_inner(table_catalog, store, vnodes, true, vec![]).await } /// Create state table from table catalog and store with sanity check disabled. @@ -211,7 +225,7 @@ where store: S, vnodes: Option>, ) -> Self { - Self::from_table_catalog_inner(table_catalog, store, vnodes, false).await + Self::from_table_catalog_inner(table_catalog, store, vnodes, false, vec![]).await } /// Create state table from table catalog and store. @@ -220,6 +234,7 @@ where store: S, vnodes: Option>, is_consistent_op: bool, + output_indices: Vec, ) -> Self { let table_id = TableId::new(table_catalog.id); let table_columns: Vec = table_catalog @@ -227,6 +242,17 @@ where .iter() .map(|col| col.column_desc.as_ref().unwrap().into()) .collect(); + let data_types: Vec = table_catalog + .columns + .iter() + .map(|col| { + col.get_column_desc() + .unwrap() + .get_column_type() + .unwrap() + .into() + }) + .collect(); let order_types: Vec = table_catalog .pk .iter() @@ -299,9 +325,15 @@ where Arc::from_iter(table_catalog.value_indices.iter().map(|val| *val as usize)), Arc::from(table_columns.into_boxed_slice()), ); + + // If state table has versioning, that means it supports + // Schema change. In that case, the row encoding should be column aware as well. + // Otherwise both will be false. + // NOTE(kwannoel): Replicated table will follow upstream table's versioning. I'm not sure + // If ALTER TABLE will propagate to this replicated table as well. Ideally it won't assert_eq!( - row_serde.kind().is_column_aware(), - table_catalog.version.is_some() + table_catalog.version.is_some(), + row_serde.kind().is_column_aware() ); let watermark_cache = if USE_WATERMARK_CACHE { @@ -326,6 +358,8 @@ where state_clean_watermark: None, prev_cleaned_watermark: None, watermark_cache, + data_types, + output_indices, } } @@ -458,7 +492,10 @@ where TableOption::default(), )) .await; - + let data_types: Vec = table_columns + .iter() + .map(|col| col.data_type.clone()) + .collect(); let pk_data_types = pk_indices .iter() .map(|i| table_columns[*i].data_type.clone()) @@ -470,7 +507,6 @@ where } else { StateTableWatermarkCache::new(0) }; - Self { table_id, local_store: local_state_store, @@ -495,9 +531,15 @@ where state_clean_watermark: None, prev_cleaned_watermark: None, watermark_cache, + data_types, + output_indices: vec![], } } + pub fn get_data_types(&self) -> &[DataType] { + &self.data_types + } + pub fn table_id(&self) -> u32 { self.table_id.table_id } @@ -541,11 +583,23 @@ where compute_vnode(pk, &self.dist_key_in_pk_indices, &self.vnodes) } - // TODO: remove, should not be exposed to user + /// NOTE(kwannoel): This is used by backfill. + /// We want to check pk indices of upstream table. pub fn pk_indices(&self) -> &[usize] { &self.pk_indices } + /// Get the indices of the primary key columns in the output columns. + /// + /// Returns `None` if any of the primary key columns is not in the output columns. 
+ pub fn pk_in_output_indices(&self) -> Option> { + assert!(IS_REPLICATED); + self.pk_indices + .iter() + .map(|&i| self.output_indices.iter().position(|&j| i == j)) + .collect() + } + pub fn pk_serde(&self) -> &OrderedRowSerde { &self.pk_serde } @@ -571,6 +625,29 @@ where } } +impl StateTableInner +where + S: StateStore, + SD: ValueRowSerde, + W: WatermarkBufferStrategy, +{ + /// Create replicated state table from table catalog with output indices + pub async fn from_table_catalog_with_output_column_ids( + table_catalog: &Table, + store: S, + vnodes: Option>, + output_column_ids: Vec, + ) -> Self { + let columns = table_catalog + .columns + .iter() + .map(|c| c.column_desc.as_ref().unwrap().into()) + .collect_vec(); + let (_, output_indices) = find_columns_by_ids(&columns[..], &output_column_ids); + Self::from_table_catalog_inner(table_catalog, store, vnodes, false, output_indices).await + } +} + // point get impl< S, @@ -589,7 +666,14 @@ where match encoded_row { Some(encoded_row) => { let row = self.row_serde.deserialize(&encoded_row)?; - Ok(Some(OwnedRow::new(row))) + if IS_REPLICATED { + // If the table is replicated, we need to deserialize the row with the output + // indices. + let row = row.project(&self.output_indices); + Ok(Some(row.into_owned_row())) + } else { + Ok(Some(OwnedRow::new(row))) + } } None => Ok(None), } @@ -1131,6 +1215,25 @@ where )) } + pub async fn iter_with_vnode_and_output_indices( + &self, + vnode: VirtualNode, + pk_range: &(Bound, Bound), + prefetch_options: PrefetchOptions, + ) -> StreamExecutorResult>> + '_> { + assert!(IS_REPLICATED); + let stream = self + .iter_with_vnode(vnode, pk_range, prefetch_options) + .await?; + Ok(stream.map(|row| { + row.map(|keyed_row| { + let (vnode_prefixed_key, row) = keyed_row.into_parts(); + let row = row.project(&self.output_indices).into_owned_row(); + KeyedRow::new(vnode_prefixed_key, row) + }) + })) + } + async fn iter_kv( &self, key_range: (Bound, Bound), @@ -1139,12 +1242,11 @@ where ) -> StreamExecutorResult<::IterStream<'_>> { let read_options = ReadOptions { prefix_hint, - ignore_range_tombstone: false, retention_seconds: self.table_option.retention_seconds, table_id: self.table_id, - read_version_from_backup: false, prefetch_options, cache_policy: CachePolicy::Fill(CachePriority::High), + ..Default::default() }; let table_key_range = map_table_key_range(key_range); @@ -1225,7 +1327,6 @@ where prefetch_options: PrefetchOptions, ) -> StreamExecutorResult<::IterStream<'_>> { let memcomparable_range = prefix_range_to_memcomparable(&self.pk_serde, pk_range); - let memcomparable_range_with_vnode = prefixed_range_with_vnode(memcomparable_range, vnode); // TODO: provide a trace of useful params. @@ -1273,12 +1374,9 @@ where let read_options = ReadOptions { prefix_hint, - ignore_range_tombstone: false, - retention_seconds: None, table_id: self.table_id, - read_version_from_backup: false, - prefetch_options: Default::default(), cache_policy: CachePolicy::Fill(CachePriority::High), + ..Default::default() }; self.local_store diff --git a/src/stream/src/executor/agg_common.rs b/src/stream/src/executor/agg_common.rs index b1feac670d942..6df0e58c6ace9 100644 --- a/src/stream/src/executor/agg_common.rs +++ b/src/stream/src/executor/agg_common.rs @@ -13,7 +13,6 @@ // limitations under the License. 
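The `ReadOptions` construction in `iter_kv` (and in the prefix-scan path) above now relies on struct update syntax, so only the fields that matter are spelled out and the rest come from `Default::default()`. A small self-contained sketch with a stand-in options struct whose defaults mirror the removed explicit fields:

#[derive(Default)]
struct Options {
    prefix_hint: Option<Vec<u8>>,
    retention_seconds: Option<u32>,
    ignore_range_tombstone: bool,
    read_version_from_backup: bool,
}

fn main() {
    // fields not listed fall back to their `Default` values
    let opts = Options {
        retention_seconds: Some(60),
        ..Default::default()
    };
    assert!(!opts.ignore_range_tombstone);
    assert!(!opts.read_version_from_backup);
    assert!(opts.prefix_hint.is_none());
    assert_eq!(opts.retention_seconds, Some(60));
}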
use std::collections::HashMap; -use std::sync::Arc; use risingwave_expr::aggregate::AggCall; use risingwave_pb::stream_plan::PbAggNodeVersion; @@ -22,7 +21,6 @@ use risingwave_storage::StateStore; use super::aggregation::AggStateStorage; use super::{Executor, ExecutorInfo}; use crate::common::table::state_table::StateTable; -use crate::executor::monitor::StreamingMetrics; use crate::executor::ActorContextRef; use crate::task::AtomicU64Ref; @@ -45,7 +43,7 @@ pub struct AggExecutorArgs { pub intermediate_state_table: StateTable, pub distinct_dedup_tables: HashMap>, pub watermark_epoch: AtomicU64Ref, - pub metrics: Arc, + // extra pub extra: E, } diff --git a/src/stream/src/executor/aggregation/distinct.rs b/src/stream/src/executor/aggregation/distinct.rs index 9e1d8d66da848..079ddf8661ae0 100644 --- a/src/stream/src/executor/aggregation/distinct.rs +++ b/src/stream/src/executor/aggregation/distinct.rs @@ -29,24 +29,20 @@ use super::{AggCall, GroupKey}; use crate::cache::{new_unbounded, ManagedLruCache}; use crate::common::metrics::MetricsInfo; use crate::common::table::state_table::StateTable; -use crate::executor::monitor::StreamingMetrics; use crate::executor::{ActorContextRef, StreamExecutorResult}; -use crate::task::ActorId; type DedupCache = ManagedLruCache>; /// Deduplicater for one distinct column. struct ColumnDeduplicater { cache: DedupCache, - metrics_info: MetricsInfo, _phantom: PhantomData, } impl ColumnDeduplicater { - fn new(watermark_epoch: &Arc, metrics_info: MetricsInfo) -> Self { + fn new(watermark_epoch: Arc, metrics_info: MetricsInfo) -> Self { Self { - cache: new_unbounded(watermark_epoch.clone(), metrics_info.clone()), - metrics_info, + cache: new_unbounded(watermark_epoch, metrics_info), _phantom: PhantomData, } } @@ -83,8 +79,7 @@ impl ColumnDeduplicater { let cache_key = CompactedRow::from(group_key.map(GroupKey::cache_key).chain(row::once(datum))); - self.metrics_info - .metrics + ctx.streaming_metrics .agg_distinct_total_cache_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); @@ -93,8 +88,7 @@ impl ColumnDeduplicater { let mut counts = if self.cache.contains(&cache_key) { self.cache.get_mut(&cache_key).unwrap() } else { - self.metrics_info - .metrics + ctx.streaming_metrics .agg_distinct_cache_miss_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); @@ -190,15 +184,15 @@ impl ColumnDeduplicater { // TODO(rc): now we flush the table in `dedup` method. // WARN: if you want to change to batching the write to table. please remember to change // `self.cache.evict()` too. + self.cache.evict(); + let actor_id_str = ctx.id.to_string(); let fragment_id_str = ctx.fragment_id.to_string(); let table_id_str = dedup_table.table_id().to_string(); - self.metrics_info - .metrics + ctx.streaming_metrics .agg_distinct_cached_entry_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .set(self.cache.len() as i64); - self.cache.evict(); } } @@ -218,16 +212,17 @@ pub struct DistinctDeduplicater { /// Key: distinct column index; /// Value: (agg call indices that distinct on the column, deduplicater for the column). 
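The `deduplicaters` map described above is keyed by distinct column index, so all agg calls that are distinct on the same column share one `ColumnDeduplicater` and one dedup state table. A simplified sketch of that grouping step (indices only, no real `AggCall`):

use std::collections::HashMap;

fn group_by_distinct_column(
    // (call index, distinct column index) for each distinct agg call
    distinct_calls: impl IntoIterator<Item = (usize, usize)>,
) -> HashMap<usize, Vec<usize>> {
    let mut map: HashMap<usize, Vec<usize>> = HashMap::new();
    for (call_idx, distinct_col) in distinct_calls {
        map.entry(distinct_col).or_default().push(call_idx);
    }
    map
}

fn main() {
    // calls 0 and 2 are distinct on column 1, call 1 on column 3
    let groups = group_by_distinct_column([(0, 1), (2, 1), (1, 3)]);
    assert_eq!(groups[&1], vec![0, 2]);
    assert_eq!(groups[&3], vec![1]);
}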
deduplicaters: HashMap, ColumnDeduplicater)>, + ctx: ActorContextRef, } impl DistinctDeduplicater { pub fn new( agg_calls: &[AggCall], - watermark_epoch: &Arc, + watermark_epoch: Arc, distinct_dedup_tables: &HashMap>, - actor_id: ActorId, - metrics: Arc, + ctx: ActorContextRef, ) -> Self { + let actor_id = ctx.id; let deduplicaters: HashMap<_, _> = agg_calls .iter() .enumerate() @@ -236,14 +231,18 @@ impl DistinctDeduplicater { .into_iter() .map(|(distinct_col, indices_and_calls)| { let table_id = distinct_dedup_tables.get(&distinct_col).unwrap().table_id(); - let metrics_info = - MetricsInfo::new(metrics.clone(), table_id, actor_id, "distinct dedup"); + let metrics_info = MetricsInfo::new( + ctx.streaming_metrics.clone(), + table_id, + actor_id, + "distinct dedup", + ); let call_indices: Box<[_]> = indices_and_calls.into_iter().map(|v| v.0).collect(); - let deduplicater = ColumnDeduplicater::new(watermark_epoch, metrics_info); + let deduplicater = ColumnDeduplicater::new(watermark_epoch.clone(), metrics_info); (distinct_col, (call_indices, deduplicater)) }) .collect(); - Self { deduplicaters } + Self { deduplicaters, ctx } } pub fn dedup_caches_mut(&mut self) -> impl Iterator { @@ -261,7 +260,6 @@ impl DistinctDeduplicater { mut visibilities: Vec, dedup_tables: &mut HashMap>, group_key: Option<&GroupKey>, - ctx: ActorContextRef, ) -> StreamExecutorResult> { for (distinct_col, (ref call_indices, deduplicater)) in &mut self.deduplicaters { let column = &columns[*distinct_col]; @@ -277,7 +275,7 @@ impl DistinctDeduplicater { visibilities, dedup_table, group_key, - ctx.clone(), + self.ctx.clone(), ) .await?; } @@ -288,11 +286,10 @@ impl DistinctDeduplicater { pub fn flush( &mut self, dedup_tables: &mut HashMap>, - ctx: ActorContextRef, ) -> StreamExecutorResult<()> { for (distinct_col, (_, deduplicater)) in &mut self.deduplicaters { let dedup_table = dedup_tables.get_mut(distinct_col).unwrap(); - deduplicater.flush(dedup_table, ctx.clone()); + deduplicater.flush(dedup_table, self.ctx.clone()); } Ok(()) } @@ -309,7 +306,6 @@ mod tests { use risingwave_storage::memory::MemoryStateStore; use super::*; - use crate::executor::monitor::StreamingMetrics; use crate::executor::ActorContext; async fn infer_dedup_tables( @@ -394,10 +390,9 @@ mod tests { let mut deduplicater = DistinctDeduplicater::new( &agg_calls, - &Arc::new(AtomicU64::new(0)), + Arc::new(AtomicU64::new(0)), &dedup_tables, - 0, - Arc::new(StreamingMetrics::unused()), + ActorContext::create(0), ); // --- chunk 1 --- @@ -413,14 +408,7 @@ mod tests { .take(agg_calls.len()) .collect_vec(); let visibilities = deduplicater - .dedup_chunk( - &ops, - &columns, - visibilities, - &mut dedup_tables, - None, - ActorContext::create(0), - ) + .dedup_chunk(&ops, &columns, visibilities, &mut dedup_tables, None) .await .unwrap(); assert_eq!( @@ -440,9 +428,7 @@ mod tests { vec![true, true] // distinct on b ); - deduplicater - .flush(&mut dedup_tables, ActorContext::create(0)) - .unwrap(); + deduplicater.flush(&mut dedup_tables).unwrap(); epoch.inc(); for table in dedup_tables.values_mut() { @@ -463,14 +449,7 @@ mod tests { .take(agg_calls.len()) .collect_vec(); let visibilities = deduplicater - .dedup_chunk( - &ops, - &columns, - visibilities, - &mut dedup_tables, - None, - ActorContext::create(0), - ) + .dedup_chunk(&ops, &columns, visibilities, &mut dedup_tables, None) .await .unwrap(); assert_eq!( @@ -490,9 +469,7 @@ mod tests { vec![false, false, true] // distinct on b ); - deduplicater - .flush(&mut dedup_tables, ActorContext::create(0)) - 
.unwrap(); + deduplicater.flush(&mut dedup_tables).unwrap(); epoch.inc(); for table in dedup_tables.values_mut() { @@ -504,10 +481,9 @@ mod tests { // test recovery let mut deduplicater = DistinctDeduplicater::new( &agg_calls, - &Arc::new(AtomicU64::new(0)), + Arc::new(AtomicU64::new(0)), &dedup_tables, - 0, - Arc::new(StreamingMetrics::unused()), + ActorContext::create(0), ); // --- chunk 3 --- @@ -524,14 +500,7 @@ mod tests { .take(agg_calls.len()) .collect_vec(); let visibilities = deduplicater - .dedup_chunk( - &ops, - &columns, - visibilities, - &mut dedup_tables, - None, - ActorContext::create(0), - ) + .dedup_chunk(&ops, &columns, visibilities, &mut dedup_tables, None) .await .unwrap(); assert_eq!( @@ -566,9 +535,7 @@ mod tests { ] ); - deduplicater - .flush(&mut dedup_tables, ActorContext::create(0)) - .unwrap(); + deduplicater.flush(&mut dedup_tables).unwrap(); epoch.inc(); for table in dedup_tables.values_mut() { @@ -603,10 +570,9 @@ mod tests { let mut deduplicater = DistinctDeduplicater::new( &agg_calls, - &Arc::new(AtomicU64::new(0)), + Arc::new(AtomicU64::new(0)), &dedup_tables, - 0, - Arc::new(StreamingMetrics::unused()), + ActorContext::create(0), ); let chunk = StreamChunk::from_pretty( @@ -629,7 +595,6 @@ mod tests { visibilities, &mut dedup_tables, Some(&group_key), - ActorContext::create(0), ) .await .unwrap(); @@ -646,9 +611,7 @@ mod tests { vec![true, true, false, false, true] // distinct on b ); - deduplicater - .flush(&mut dedup_tables, ActorContext::create(0)) - .unwrap(); + deduplicater.flush(&mut dedup_tables).unwrap(); epoch.inc(); for table in dedup_tables.values_mut() { @@ -673,7 +636,6 @@ mod tests { visibilities, &mut dedup_tables, Some(&group_key), - ActorContext::create(0), ) .await .unwrap(); @@ -700,9 +662,7 @@ mod tests { ] ); - deduplicater - .flush(&mut dedup_tables, ActorContext::create(0)) - .unwrap(); + deduplicater.flush(&mut dedup_tables).unwrap(); epoch.inc(); for table in dedup_tables.values_mut() { diff --git a/src/stream/src/executor/backfill/arrangement_backfill.rs b/src/stream/src/executor/backfill/arrangement_backfill.rs index 0bd6e47841584..28fcaa8862faa 100644 --- a/src/stream/src/executor/backfill/arrangement_backfill.rs +++ b/src/stream/src/executor/backfill/arrangement_backfill.rs @@ -12,35 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashMap; use std::pin::pin; use std::sync::Arc; use either::Either; use futures::stream::select_with_strategy; -use futures::{pin_mut, stream, StreamExt, TryStreamExt}; +use futures::{stream, StreamExt, TryStreamExt}; use futures_async_stream::try_stream; use itertools::Itertools; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::bail; use risingwave_common::catalog::Schema; use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; -use risingwave_common::types::Datum; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; use risingwave_common::util::iter_util::ZipEqDebug; -use risingwave_common::util::select_all; +use risingwave_storage::row_serde::value_serde::ValueRowSerde; use risingwave_storage::StateStore; -use crate::common::table::state_table::ReplicatedStateTable; +use crate::common::table::state_table::{ReplicatedStateTable, StateTable}; +#[cfg(debug_assertions)] +use crate::executor::backfill::utils::METADATA_STATE_LEN; use crate::executor::backfill::utils::{ - compute_bounds, construct_initial_finished_state, get_progress_per_vnode, iter_chunks, - mapping_chunk, mapping_message, mark_chunk_ref_by_vnode, owned_row_iter, - persist_state_per_vnode, update_pos_by_vnode, BackfillProgressPerVnode, BackfillState, + compute_bounds, create_builder, get_progress_per_vnode, iter_chunks, mapping_chunk, + mapping_message, mark_chunk_ref_by_vnode, owned_row_iter, persist_state_per_vnode, + update_pos_by_vnode, BackfillProgressPerVnode, BackfillState, }; use crate::executor::monitor::StreamingMetrics; use crate::executor::{ expect_first_barrier, Barrier, BoxedExecutor, BoxedMessageStream, Executor, ExecutorInfo, - Message, PkIndices, PkIndicesRef, StreamExecutorError, + Message, PkIndicesRef, StreamExecutorError, }; use crate::task::{ActorId, CreateMviewProgress}; @@ -49,15 +49,15 @@ use crate::task::{ActorId, CreateMviewProgress}; /// - [`ArrangementBackfillExecutor`] can reside on a different CN, so it can be scaled /// independently. /// - To synchronize upstream shared buffer, it is initialized with a [`ReplicatedStateTable`]. -pub struct ArrangementBackfillExecutor { +pub struct ArrangementBackfillExecutor { /// Upstream table - upstream_table: ReplicatedStateTable, + upstream_table: ReplicatedStateTable, /// Upstream with the same schema with the upstream table. upstream: BoxedExecutor, /// Internal state table for persisting state of backfill state. - state_table: ReplicatedStateTable, + state_table: StateTable, /// The column indices need to be forwarded to the downstream from the upstream and table scan. 
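The `output_indices` field above combines with the upstream table's pk indices to form `pk_in_output_indices`, now provided by the `pk_in_output_indices` method added to the replicated state table earlier in this diff. The mapping, restated standalone with concrete values:

// For each pk index, find its position among the output columns; the result is
// `None` if any pk column is not part of the output.
fn pk_in_output_indices(pk_indices: &[usize], output_indices: &[usize]) -> Option<Vec<usize>> {
    pk_indices
        .iter()
        .map(|&pk| output_indices.iter().position(|&o| o == pk))
        .collect()
}

fn main() {
    assert_eq!(pk_in_output_indices(&[0, 2], &[2, 1, 0]), Some(vec![2, 0]));
    assert_eq!(pk_in_output_indices(&[3], &[0, 1]), None);
}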
output_indices: Vec, @@ -71,31 +71,30 @@ pub struct ArrangementBackfillExecutor { metrics: Arc, chunk_size: usize, + + rate_limit: Option, } -impl ArrangementBackfillExecutor +impl ArrangementBackfillExecutor where S: StateStore, + SD: ValueRowSerde, { #[allow(clippy::too_many_arguments)] #[allow(dead_code)] pub fn new( - upstream_table: ReplicatedStateTable, + info: ExecutorInfo, + upstream_table: ReplicatedStateTable, upstream: BoxedExecutor, - state_table: ReplicatedStateTable, + state_table: StateTable, output_indices: Vec, progress: CreateMviewProgress, - schema: Schema, - pk_indices: PkIndices, metrics: Arc, chunk_size: usize, + rate_limit: Option, ) -> Self { Self { - info: ExecutorInfo { - schema, - pk_indices, - identity: "ArrangementBackfillExecutor".to_owned(), - }, + info, upstream_table, upstream, state_table, @@ -104,6 +103,7 @@ where progress, metrics, chunk_size, + rate_limit, } } @@ -111,79 +111,62 @@ where async fn execute_inner(mut self) { // The primary key columns, in the output columns of the upstream_table scan. // Table scan scans a subset of the columns of the upstream table. - let pk_in_output_indices = self - .upstream_table - .pk_indices() - .iter() - .map(|&i| self.output_indices.iter().position(|&j| i == j)) - .collect::>>() - .unwrap(); - let state_len = pk_in_output_indices.len() + 2; // +1 for backfill_finished, +1 for vnode key. + let pk_in_output_indices = self.upstream_table.pk_in_output_indices().unwrap(); + #[cfg(debug_assertions)] + let state_len = self.upstream_table.pk_indices().len() + METADATA_STATE_LEN; let pk_order = self.upstream_table.pk_serde().get_order_types().to_vec(); let upstream_table_id = self.upstream_table.table_id(); let mut upstream_table = self.upstream_table; let vnodes = upstream_table.vnodes().clone(); - let schema = Arc::new(self.upstream.schema().clone()); + // These builders will build data chunks. + // We must supply them with the full datatypes which correspond to + // pk + output_indices. + let snapshot_data_types = self + .upstream + .schema() + .fields() + .iter() + .map(|field| field.data_type.clone()) + .collect_vec(); + let mut builders = upstream_table + .vnodes() + .iter_vnodes() + .map(|_| { + create_builder( + self.rate_limit, + self.chunk_size, + snapshot_data_types.clone(), + ) + }) + .collect_vec(); let mut upstream = self.upstream.execute(); // Poll the upstream to get the first barrier. let first_barrier = expect_first_barrier(&mut upstream).await?; - self.state_table.init_epoch(first_barrier.epoch).await?; + let first_epoch = first_barrier.epoch; + self.state_table.init_epoch(first_barrier.epoch); let progress_per_vnode = get_progress_per_vnode(&self.state_table).await?; - let is_completely_finished = progress_per_vnode - .iter() - .all(|(_, p)| *p == BackfillProgressPerVnode::Completed); + let is_completely_finished = progress_per_vnode.iter().all(|(_, p)| { + matches!( + p.current_state(), + &BackfillProgressPerVnode::Completed { .. } + ) + }); if is_completely_finished { assert!(!first_barrier.is_newly_added(self.actor_id)); } - let mut backfill_state: BackfillState = progress_per_vnode.into(); - let mut committed_progress = HashMap::new(); - - let mut builders = upstream_table - .vnodes() - .iter_vnodes() - .map(|_| DataChunkBuilder::new(schema.data_types(), self.chunk_size)) - .collect_vec(); - - // If the snapshot is empty, we don't need to backfill. - // We cannot complete progress now, as we want to persist - // finished state to state store first. - // As such we will wait for next barrier. 
- let is_snapshot_empty: bool = { - if is_completely_finished { - // It is finished, so just assign a value to avoid accessing storage table again. - false - } else { - let snapshot = Self::snapshot_read_per_vnode( - &upstream_table, - backfill_state.clone(), // FIXME: temporary workaround... How to avoid it? - &mut builders, - ); - pin_mut!(snapshot); - snapshot.try_next().await?.unwrap().is_none() - } - }; - - // | backfill_is_finished | snapshot_empty | -> | need_to_backfill | - // | -------------------- | -------------- | -- | ---------------- | - // | t | t/f | -> | f | - // | f | t | -> | f | - // | f | f | -> | t | - let to_backfill = !is_completely_finished && !is_snapshot_empty; - - // Use these to persist state. - // They contain the backfill position, and the progress. - // However, they do not contain the vnode key (index 0). - // That is filled in when we flush the state table. - let mut temporary_state: Vec = vec![None; state_len]; - // The first barrier message should be propagated. yield Message::Barrier(first_barrier); + upstream_table.init_epoch(first_epoch).await?; + + let mut backfill_state: BackfillState = progress_per_vnode.into(); + + let to_backfill = !is_completely_finished; // If no need backfill, but state was still "unfinished" we need to finish it. // So we just update the state + progress to meta at the next barrier to finish progress, @@ -242,7 +225,7 @@ where let right_snapshot = pin!(Self::snapshot_read_per_vnode( &upstream_table, - backfill_state.clone(), // FIXME: temporary workaround, how to avoid it? + backfill_state.clone(), // FIXME: Use mutable reference instead. &mut builders, ) .map(Either::Right),); @@ -311,15 +294,16 @@ where &chunk, &pk_in_output_indices, &mut backfill_state, - ); + )?; let chunk_cardinality = chunk.cardinality() as u64; cur_barrier_snapshot_processed_rows += chunk_cardinality; total_snapshot_processed_rows += chunk_cardinality; - yield Message::Chunk(mapping_chunk( + let chunk = Message::Chunk(mapping_chunk( chunk, &self.output_indices, )); + yield chunk; } } } @@ -335,7 +319,6 @@ where Some(barrier) => barrier, None => bail!("BUG: current_backfill loop exited without a barrier"), }; - // TODO: Process existing buffered snapshots. // Process barrier: // - consume snapshot rows left in builder. @@ -360,7 +343,7 @@ where &chunk, &pk_in_output_indices, &mut backfill_state, - ); + )?; let chunk_cardinality = chunk.cardinality() as u64; cur_barrier_snapshot_processed_rows += chunk_cardinality; @@ -388,8 +371,9 @@ where &self.output_indices, )); } - // Replicate - upstream_table.write_chunk(chunk); + + // FIXME(kwannoel): Replicate + // upstream_table.write_chunk(chunk); } if upstream_chunk_buffer_is_empty { @@ -417,6 +401,9 @@ where // Update snapshot read epoch. snapshot_read_epoch = barrier.epoch.prev; + // TODO(kwannoel): Not sure if this holds for arrangement backfill. + // May need to revisit it. + // Need to check it after scale-in / scale-out. self.progress.update( barrier.epoch.curr, snapshot_read_epoch, @@ -427,13 +414,19 @@ where persist_state_per_vnode( barrier.epoch, &mut self.state_table, - false, - &backfill_state, - &mut committed_progress, - &mut temporary_state, + &mut backfill_state, + #[cfg(debug_assertions)] + state_len, + vnodes.iter_vnodes(), ) .await?; + tracing::trace!( + actor = self.actor_id, + barrier = ?barrier, + "barrier persisted" + ); + yield Message::Barrier(barrier); // We will switch snapshot at the start of the next iteration of the backfill loop. 
@@ -442,13 +435,20 @@ where tracing::trace!( actor = self.actor_id, - "Backfill has already finished and forward messages directly to the downstream" + "Arrangement Backfill has finished and forward messages directly to the downstream" ); + // Update our progress as finished in state table. + // Wait for first barrier to come after backfill is finished. // So we can update our progress + persist the status. while let Some(Ok(msg)) = upstream.next().await { if let Some(msg) = mapping_message(msg, &self.output_indices) { + tracing::trace!( + actor = self.actor_id, + message = ?msg, + "backfill_finished_wait_for_barrier" + ); // If not finished then we need to update state, otherwise no need. if let Message::Barrier(barrier) = &msg && !is_completely_finished @@ -459,24 +459,17 @@ where // This is because we can't update state table in first epoch, // since it expects to have been initialized in previous epoch // (there's no epoch before the first epoch). - if is_snapshot_empty { - let finished_state = - construct_initial_finished_state(pk_in_output_indices.len()); - for vnode in upstream_table.vnodes().iter_vnodes() { - backfill_state.update_progress( - vnode, - BackfillProgressPerVnode::InProgress(finished_state.clone()), - ); - } + for vnode in upstream_table.vnodes().iter_vnodes() { + backfill_state.finish_progress(vnode, upstream_table.pk_indices().len()); } persist_state_per_vnode( barrier.epoch, &mut self.state_table, - false, - &backfill_state, - &mut committed_progress, - &mut temporary_state, + &mut backfill_state, + #[cfg(debug_assertions)] + state_len, + vnodes.iter_vnodes() ) .await?; @@ -484,8 +477,10 @@ where .finish(barrier.epoch.curr, total_snapshot_processed_rows); yield msg; break; + } else { + // Allow other messages to pass through. + yield msg; } - yield msg; } } @@ -495,6 +490,11 @@ where #[for_await] for msg in upstream { if let Some(msg) = mapping_message(msg?, &self.output_indices) { + tracing::trace!( + actor = self.actor_id, + message = ?msg, + "backfill_finished_after_barrier" + ); if let Message::Barrier(barrier) = &msg { self.state_table.commit_no_data_expected(barrier.epoch); } @@ -510,19 +510,18 @@ where /// 3. Change it into a chunk iterator with `iter_chunks`. /// This means it should fetch a row from each iterator to form a chunk. /// - /// We will return chunks based on the `BackfillProgressPerVnode`. - /// 1. Completed(vnode): Current iterator is complete, in that case we need to handle it - /// in arrangement backfill. We should not buffer updates for this vnode, - /// and we should forward all messages. - /// 2. InProgress(CHUNK): Current iterator is not complete, in that case we - /// need to buffer updates for this vnode. - /// 3. Finished: All iterators finished. - /// - /// NOTE(kwannoel): We interleave at chunk per vnode level rather than rows. + /// We interleave at chunk per vnode level rather than rows. /// This is so that we can compute `current_pos` once per chunk, since they correspond to 1 /// vnode. /// - /// NOTE(kwannoel): + /// The stream contains pairs of `(VirtualNode, StreamChunk)`. + /// The `VirtualNode` is the vnode that the chunk belongs to. + /// The `StreamChunk` is the chunk that contains the rows from the vnode. + /// If it's `None`, it means the vnode has no more rows for this snapshot read. + /// + /// The `snapshot_read_epoch` is supplied as a parameter for `state_table`. + /// It is required to ensure we read a fully-checkpointed snapshot the **first time**. 
+ /// /// The rows from upstream snapshot read will be buffered inside the `builder`. /// If snapshot is dropped before its rows are consumed, /// remaining data in `builder` must be flushed manually. @@ -530,11 +529,10 @@ where /// present, Then when we flush we contain duplicate rows. #[try_stream(ok = Option<(VirtualNode, StreamChunk)>, error = StreamExecutorError)] async fn snapshot_read_per_vnode<'a>( - upstream_table: &'a ReplicatedStateTable, + upstream_table: &'a ReplicatedStateTable, backfill_state: BackfillState, builders: &'a mut [DataChunkBuilder], ) { - let mut streams = Vec::with_capacity(upstream_table.vnodes().len()); for (vnode, builder) in upstream_table .vnodes() .iter_vnodes() @@ -542,11 +540,9 @@ where { let backfill_progress = backfill_state.get_progress(&vnode)?; let current_pos = match backfill_progress { - BackfillProgressPerVnode::Completed => { - continue; - } BackfillProgressPerVnode::NotStarted => None, - BackfillProgressPerVnode::InProgress(current_pos) => Some(current_pos.clone()), + BackfillProgressPerVnode::Completed(current_pos) + | BackfillProgressPerVnode::InProgress(current_pos) => Some(current_pos.clone()), }; let range_bounds = compute_bounds(upstream_table.pk_indices(), current_pos.clone()); @@ -555,30 +551,36 @@ where } let range_bounds = range_bounds.unwrap(); + tracing::trace!( + vnode = ?vnode, + current_pos = ?current_pos, + range_bounds = ?range_bounds, + "iter_with_vnode_and_output_indices" + ); let vnode_row_iter = upstream_table - .iter_with_vnode(vnode, &range_bounds, Default::default()) + .iter_with_vnode_and_output_indices(vnode, &range_bounds, Default::default()) .await?; - // TODO: Is there some way to avoid double-pin here? let vnode_row_iter = Box::pin(owned_row_iter(vnode_row_iter)); - let vnode_chunk_iter = iter_chunks(vnode_row_iter, builder) - .map_ok(move |chunk_opt| chunk_opt.map(|chunk| (vnode, chunk))); - // TODO: Is there some way to avoid double-pin - streams.push(Box::pin(vnode_chunk_iter)); - } - #[for_await] - for chunk in select_all(streams) { - yield chunk?; + let vnode_chunk_iter = + iter_chunks(vnode_row_iter, builder).map_ok(move |chunk| (vnode, chunk)); + + // This means we iterate serially rather than in parallel across vnodes. 
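On the serial-iteration comment above: the removed code collected one chunk stream per vnode and polled them through `select_all`, so chunks from different vnodes interleaved; the new loop drains one vnode before starting the next. A rough sketch of the two shapes, using `futures`' `select_all` as a stand-in for the previous helper:

use futures::stream::{self, Stream, StreamExt};

// After this change: vnode 0's iterator is fully drained before vnode 1's starts.
async fn serial<S: Stream<Item = u64> + Unpin>(vnode_streams: Vec<S>) -> Vec<u64> {
    let mut out = Vec::new();
    for mut s in vnode_streams {
        while let Some(x) = s.next().await {
            out.push(x);
        }
    }
    out
}

// Roughly the previous shape: all per-vnode streams polled together, so items
// from different vnodes could interleave.
async fn interleaved<S: Stream<Item = u64> + Unpin>(vnode_streams: Vec<S>) -> Vec<u64> {
    stream::select_all(vnode_streams).collect::<Vec<_>>().await
}

fn main() {
    let out = futures::executor::block_on(serial(vec![stream::iter([1u64, 2]), stream::iter([10u64, 20])]));
    assert_eq!(out, vec![1, 2, 10, 20]);
    let out2 = futures::executor::block_on(interleaved(vec![stream::iter([3u64]), stream::iter([4u64])]));
    assert_eq!(out2.len(), 2);
}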
+ #[for_await] + for chunk in vnode_chunk_iter { + yield Some(chunk?); + } } yield None; return Ok(()); } } -impl Executor for ArrangementBackfillExecutor +impl Executor for ArrangementBackfillExecutor where S: StateStore, + SD: ValueRowSerde, { fn execute(self: Box) -> BoxedMessageStream { self.execute_inner().boxed() diff --git a/src/stream/src/executor/backfill/cdc/upstream_table/snapshot.rs b/src/stream/src/executor/backfill/cdc/upstream_table/snapshot.rs index 6c5ba1affe212..cc8883aeea6b1 100644 --- a/src/stream/src/executor/backfill/cdc/upstream_table/snapshot.rs +++ b/src/stream/src/executor/backfill/cdc/upstream_table/snapshot.rs @@ -104,8 +104,9 @@ impl UpstreamTableRead for UpstreamTableReader { let chunk_stream = iter_chunks(row_stream, &mut builder); #[for_await] for chunk in chunk_stream { - yield chunk?; + yield Some(chunk?); } + yield None; } async fn current_binlog_offset(&self) -> StreamExecutorResult> { diff --git a/src/stream/src/executor/backfill/no_shuffle_backfill.rs b/src/stream/src/executor/backfill/no_shuffle_backfill.rs index 05e2df32c9a52..bf1b5709c8920 100644 --- a/src/stream/src/executor/backfill/no_shuffle_backfill.rs +++ b/src/stream/src/executor/backfill/no_shuffle_backfill.rs @@ -23,7 +23,7 @@ use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::catalog::Schema; use risingwave_common::hash::VnodeBitmapExt; use risingwave_common::row::{OwnedRow, Row}; -use risingwave_common::types::{DataType, Datum}; +use risingwave_common::types::Datum; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; use risingwave_common::util::epoch::EpochPair; use risingwave_common::{bail, row}; @@ -35,8 +35,8 @@ use risingwave_storage::StateStore; use crate::common::table::state_table::StateTable; use crate::executor::backfill::utils; use crate::executor::backfill::utils::{ - compute_bounds, construct_initial_finished_state, get_new_pos, iter_chunks, mapping_chunk, - mapping_message, mark_chunk, owned_row_iter, + compute_bounds, construct_initial_finished_state, create_builder, get_new_pos, iter_chunks, + mapping_chunk, mapping_message, mark_chunk, owned_row_iter, METADATA_STATE_LEN, }; use crate::executor::monitor::StreamingMetrics; use crate::executor::{ @@ -45,9 +45,6 @@ use crate::executor::{ }; use crate::task::{ActorId, CreateMviewProgress}; -/// vnode, `is_finished`, `row_count`, all occupy 1 column each. -const METADATA_STATE_LEN: usize = 3; - /// Schema: | vnode | pk ... | `backfill_finished` | `row_count` | /// We can decode that into `BackfillState` on recovery. #[derive(Debug, Eq, PartialEq)] @@ -170,7 +167,7 @@ where .await?; tracing::trace!(is_finished, row_count, "backfill state recovered"); - let mut builder = Self::create_builder( + let mut builder = create_builder( rate_limit, self.chunk_size, self.upstream_table.schema().data_types(), @@ -457,7 +454,7 @@ where "actor rate limit changed", ); assert!(builder.is_empty()); - builder = Self::create_builder( + builder = create_builder( rate_limit, self.chunk_size, self.upstream_table.schema().data_types(), @@ -646,14 +643,14 @@ where PrefetchOptions::prefetch_for_small_range_scan(), ) .await?; - let row_iter = owned_row_iter(iter); pin_mut!(row_iter); #[for_await] for chunk in iter_chunks(row_iter, builder) { - yield chunk?; + yield Some(chunk?); } + yield None; } async fn persist_state( @@ -678,29 +675,6 @@ where ) .await } - - /// Creates a data chunk builder for snapshot read. - /// If the `rate_limit` is smaller than `chunk_size`, it will take precedence. 
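The helper removed above now lives in `backfill::utils::create_builder` (see the updated imports in this file and in `arrangement_backfill.rs`). Its sizing rule, restated with the limit assumed to be a plain `usize`:

// If a rate limit is set and is smaller than the configured chunk size, the
// builder is sized to the rate limit so snapshot reads come out in smaller chunks.
fn builder_capacity(rate_limit: Option<usize>, chunk_size: usize) -> usize {
    match rate_limit {
        Some(limit) if limit < chunk_size => limit,
        _ => chunk_size,
    }
}

fn main() {
    assert_eq!(builder_capacity(Some(256), 1024), 256);
    assert_eq!(builder_capacity(Some(4096), 1024), 1024);
    assert_eq!(builder_capacity(None, 1024), 1024);
}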
- /// This is so we can partition snapshot read into smaller chunks than chunk size. - fn create_builder( - rate_limit: Option, - chunk_size: usize, - data_types: Vec, - ) -> DataChunkBuilder { - if let Some(rate_limit) = rate_limit - && rate_limit < chunk_size - { - DataChunkBuilder::new( - data_types, - rate_limit, - ) - } else { - DataChunkBuilder::new( - data_types, - chunk_size, - ) - } - } } impl Executor for BackfillExecutor diff --git a/src/stream/src/executor/backfill/utils.rs b/src/stream/src/executor/backfill/utils.rs index 663f9be94cf5e..d344b23c294dc 100644 --- a/src/stream/src/executor/backfill/utils.rs +++ b/src/stream/src/executor/backfill/utils.rs @@ -27,7 +27,7 @@ use risingwave_common::bail; use risingwave_common::buffer::BitmapBuilder; use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; use risingwave_common::row::{OwnedRow, Row, RowExt}; -use risingwave_common::types::Datum; +use risingwave_common::types::{DataType, Datum}; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::iter_util::ZipEqDebug; @@ -45,21 +45,31 @@ use crate::executor::{ Message, PkIndicesRef, StreamExecutorError, StreamExecutorResult, Watermark, }; +/// `vnode`, `is_finished`, `row_count`, all occupy 1 column each. +pub const METADATA_STATE_LEN: usize = 3; + #[derive(Clone, Debug)] pub struct BackfillState { /// Used to track backfill progress. - inner: HashMap, + // TODO: Instead of using hashmap, perhaps we can just use static array. + inner: HashMap, } impl BackfillState { - fn has_no_progress(&self) -> bool { - self.inner - .values() - .all(|p| !matches!(p, BackfillProgressPerVnode::InProgress(_))) + pub(crate) fn has_progress(&self) -> bool { + self.inner.values().any(|p| { + matches!( + p.current_state(), + &BackfillProgressPerVnode::InProgress { .. } + ) + }) } - pub(crate) fn has_progress(&self) -> bool { - !self.has_no_progress() + pub(crate) fn get_current_state( + &mut self, + vnode: &VirtualNode, + ) -> &mut BackfillProgressPerVnode { + &mut self.inner.get_mut(vnode).unwrap().current_state } // Expects the vnode to always have progress, otherwise it will return an error. @@ -68,7 +78,7 @@ impl BackfillState { vnode: &VirtualNode, ) -> StreamExecutorResult<&BackfillProgressPerVnode> { match self.inner.get(vnode) { - Some(p) => Ok(p), + Some(p) => Ok(p.current_state()), None => bail!( "Backfill progress for vnode {:#?} not found, backfill_state not initialized properly", vnode, @@ -79,20 +89,132 @@ impl BackfillState { pub(crate) fn update_progress( &mut self, vnode: VirtualNode, - progress: BackfillProgressPerVnode, - ) -> Option { - self.inner.insert(vnode, progress) + new_pos: OwnedRow, + ) -> StreamExecutorResult<()> { + let state = self.get_current_state(&vnode); + let new_state = BackfillProgressPerVnode::InProgress(new_pos); + match state { + BackfillProgressPerVnode::NotStarted => *state = new_state, + BackfillProgressPerVnode::InProgress(_current_pos) => *state = new_state, + BackfillProgressPerVnode::Completed { .. 
} => unreachable!(), + } + Ok(()) } - fn iter_backfill_progress( - &self, - ) -> impl Iterator { - self.inner.iter() + pub(crate) fn finish_progress(&mut self, vnode: VirtualNode, pos_len: usize) { + let finished_placeholder_position = construct_initial_finished_state(pos_len); + let current_state = self.get_current_state(&vnode); + let new_pos = match current_state { + BackfillProgressPerVnode::NotStarted => finished_placeholder_position, + BackfillProgressPerVnode::InProgress(current_pos) => current_pos.clone(), + BackfillProgressPerVnode::Completed { .. } => { + return; + } + }; + *current_state = BackfillProgressPerVnode::Completed(new_pos); + } + + /// Return state to be committed. + fn get_commit_state(&self, vnode: &VirtualNode) -> Option<(Option>, Vec)> { + let new_state = self.inner.get(vnode).unwrap().current_state().clone(); + let new_encoded_state = match new_state { + BackfillProgressPerVnode::NotStarted => unreachable!(), + BackfillProgressPerVnode::InProgress(current_pos) => { + let mut encoded_state = vec![None; current_pos.len() + METADATA_STATE_LEN]; + encoded_state[0] = Some(vnode.to_scalar().into()); + encoded_state[1..current_pos.len() + 1].clone_from_slice(current_pos.as_inner()); + encoded_state[current_pos.len() + 1] = Some(false.into()); + encoded_state[current_pos.len() + 2] = Some(0i64.into()); + encoded_state + } + BackfillProgressPerVnode::Completed(current_pos) => { + let mut encoded_state = vec![None; current_pos.len() + METADATA_STATE_LEN]; + encoded_state[0] = Some(vnode.to_scalar().into()); + encoded_state[1..current_pos.len() + 1].clone_from_slice(current_pos.as_inner()); + encoded_state[current_pos.len() + 1] = Some(true.into()); + encoded_state[current_pos.len() + 2] = Some(0i64.into()); + encoded_state + } + }; + let old_state = self.inner.get(vnode).unwrap().committed_state().clone(); + let old_encoded_state = match old_state { + BackfillProgressPerVnode::NotStarted => None, + BackfillProgressPerVnode::InProgress(committed_pos) => { + let mut encoded_state = vec![None; committed_pos.len() + METADATA_STATE_LEN]; + encoded_state[0] = Some(vnode.to_scalar().into()); + encoded_state[1..committed_pos.len() + 1] + .clone_from_slice(committed_pos.as_inner()); + encoded_state[committed_pos.len() + 1] = Some(false.into()); + encoded_state[committed_pos.len() + 2] = Some(0i64.into()); + Some(encoded_state) + } + BackfillProgressPerVnode::Completed(committed_pos) => { + let mut encoded_state = vec![None; committed_pos.len() + METADATA_STATE_LEN]; + encoded_state[0] = Some(vnode.to_scalar().into()); + encoded_state[1..committed_pos.len() + 1] + .clone_from_slice(committed_pos.as_inner()); + encoded_state[committed_pos.len() + 1] = Some(true.into()); + encoded_state[committed_pos.len() + 2] = Some(0i64.into()); + Some(encoded_state) + } + }; + Some((old_encoded_state, new_encoded_state)) + } + + // TODO: We can add a committed flag to speed up this check. + /// Checks if the state needs to be committed. + fn need_commit(&self, vnode: &VirtualNode) -> bool { + let state = self.inner.get(vnode).unwrap(); + match state.current_state() { + // If current state and committed state are the same, we don't need to commit. 
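`get_commit_state` above lays the progress out in the state-table schema `| vnode | pk ... | is_finished | row_count |` (with `row_count` currently written as 0). A simplified sketch of that row layout, with strings standing in for datums:

fn encode_progress(vnode: i16, current_pos: &[i64], is_finished: bool) -> Vec<String> {
    let mut row = Vec::with_capacity(current_pos.len() + 3);
    row.push(format!("vnode:{vnode}"));                        // column 0: vnode
    row.extend(current_pos.iter().map(|d| format!("pk:{d}"))); // columns 1..=pk_len: current_pos
    row.push(format!("is_finished:{is_finished}"));            // pk_len + 1
    row.push("row_count:0".to_string());                       // pk_len + 2
    row
}

fn main() {
    let row = encode_progress(3, &[42], false);
    assert_eq!(row.join("|"), "vnode:3|pk:42|is_finished:false|row_count:0");
}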
+ s @ BackfillProgressPerVnode::InProgress(_current_pos) + | s @ BackfillProgressPerVnode::Completed(_current_pos) => s != state.committed_state(), + BackfillProgressPerVnode::NotStarted => false, + } + } + + fn mark_committed(&mut self, vnode: VirtualNode) { + let BackfillStatePerVnode { + committed_state, + current_state, + } = self.inner.get_mut(&vnode).unwrap(); + + assert!(matches!( + current_state, + BackfillProgressPerVnode::InProgress(_) | BackfillProgressPerVnode::Completed(_) + )); + *committed_state = current_state.clone(); + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct BackfillStatePerVnode { + committed_state: BackfillProgressPerVnode, + current_state: BackfillProgressPerVnode, +} + +impl BackfillStatePerVnode { + pub(crate) fn new( + committed_state: BackfillProgressPerVnode, + current_state: BackfillProgressPerVnode, + ) -> Self { + Self { + committed_state, + current_state, + } + } + + pub(crate) fn committed_state(&self) -> &BackfillProgressPerVnode { + &self.committed_state + } + + pub(crate) fn current_state(&self) -> &BackfillProgressPerVnode { + &self.current_state } } -impl From> for BackfillState { - fn from(v: Vec<(VirtualNode, BackfillProgressPerVnode)>) -> Self { +impl From> for BackfillState { + fn from(v: Vec<(VirtualNode, BackfillStatePerVnode)>) -> Self { Self { inner: v.into_iter().collect(), } @@ -100,11 +222,13 @@ impl From> for BackfillState { } /// Used for tracking backfill state per vnode +/// The `OwnedRow` only contains the pk of upstream, to track `current_pos`. #[derive(Clone, Eq, PartialEq, Debug)] pub enum BackfillProgressPerVnode { + /// no entry exists for a vnode, or on initialization of the executor. NotStarted, InProgress(OwnedRow), - Completed, + Completed(OwnedRow), } pub(crate) fn mark_chunk( @@ -155,11 +279,14 @@ pub(crate) fn mark_chunk_ref_by_vnode( // I will revisit it again when arrangement_backfill is implemented e2e. let vnode = VirtualNode::compute_row(row, pk_in_output_indices); let v = match backfill_state.get_progress(&vnode)? { - BackfillProgressPerVnode::Completed => true, + // We want to just forward the row, if the vnode has finished backfill. + BackfillProgressPerVnode::Completed(_) => true, + // If not started, no need to forward. BackfillProgressPerVnode::NotStarted => false, + // If in progress, we need to check row <= current_pos. BackfillProgressPerVnode::InProgress(current_pos) => { let lhs = row.project(pk_in_output_indices); - let rhs = current_pos.project(pk_in_output_indices); + let rhs = current_pos; let order = cmp_datum_iter(lhs.iter(), rhs.iter(), pk_order.iter().copied()); match order { Ordering::Less | Ordering::Equal => true, @@ -274,10 +401,10 @@ pub(crate) fn mapping_message(msg: Message, upstream_indices: &[usize]) -> Optio } } -/// Gets progress per vnode, so we know which to backfill. +/// Recovers progress per vnode, so we know which to backfill. 
pub(crate) async fn get_progress_per_vnode( state_table: &StateTableInner, -) -> StreamExecutorResult> { +) -> StreamExecutorResult> { debug_assert!(!state_table.vnode_bitmap().is_empty()); let vnodes = state_table.vnodes().iter_vnodes(); let mut result = Vec::with_capacity(state_table.vnodes().len()); @@ -292,19 +419,36 @@ pub(crate) async fn get_progress_per_vnode { - let vnode_is_finished = row.last().unwrap(); - if vnode_is_finished.into_bool() { - BackfillProgressPerVnode::Completed + let vnode_is_finished = row.as_inner().get(row.len() - 2).unwrap(); + let vnode_is_finished = vnode_is_finished.as_ref().unwrap(); + + // Only the current pos should be contained in the in-memory backfill state. + // Row count will be added later. + let current_pos = row.as_inner().get(..row.len() - 2).unwrap(); + let current_pos = current_pos.into_owned_row(); + if *vnode_is_finished.as_bool() { + BackfillStatePerVnode::new( + BackfillProgressPerVnode::Completed(current_pos.clone()), + BackfillProgressPerVnode::Completed(current_pos), + ) } else { - BackfillProgressPerVnode::InProgress(row) + BackfillStatePerVnode::new( + BackfillProgressPerVnode::InProgress(current_pos.clone()), + BackfillProgressPerVnode::InProgress(current_pos), + ) } } - None => BackfillProgressPerVnode::NotStarted, + None => BackfillStatePerVnode::new( + BackfillProgressPerVnode::NotStarted, + BackfillProgressPerVnode::NotStarted, + ), }; result.push((vnode, backfill_progress)); } + assert_eq!(result.len(), state_table.vnodes().count_ones()); Ok(result) } @@ -345,18 +489,6 @@ pub(crate) async fn flush_data( table.commit(epoch).await } -/// We want to avoid allocating a row for every vnode. -pub(crate) fn build_temporary_state_with_vnode( - row_state: &mut [Datum], - vnode: VirtualNode, - is_finished: bool, - current_pos: &OwnedRow, -) { - row_state[1..current_pos.len() + 1].clone_from_slice(current_pos.as_inner()); - row_state[current_pos.len() + 1] = Some(is_finished.into()); - row_state[0] = Some(vnode.to_scalar().into()); -} - /// We want to avoid allocating a row for every vnode. /// Instead we can just modify a single row, and dispatch it to state table to write. /// This builds the following segments of the row: @@ -380,9 +512,11 @@ pub(crate) fn update_pos_by_vnode( chunk: &StreamChunk, pk_in_output_indices: &[usize], backfill_state: &mut BackfillState, -) { +) -> StreamExecutorResult<()> { let new_pos = get_new_pos(chunk, pk_in_output_indices); - backfill_state.update_progress(vnode, BackfillProgressPerVnode::InProgress(new_pos)); + assert_eq!(new_pos.len(), pk_in_output_indices.len()); + backfill_state.update_progress(vnode, new_pos)?; + Ok(()) } /// Get new backfill pos from the chunk. 
Since chunk should have ordered rows, we can just take the @@ -452,11 +586,12 @@ where } } -#[try_stream(ok = Option, error = StreamExecutorError)] -pub(crate) async fn iter_chunks<'a, S, E>(mut iter: S, builder: &'a mut DataChunkBuilder) +#[try_stream(ok = StreamChunk, error = StreamExecutorError)] +pub(crate) async fn iter_chunks<'a, S, E, R>(mut iter: S, builder: &'a mut DataChunkBuilder) where StreamExecutorError: From, - S: Stream> + Unpin + 'a, + R: Row, + S: Stream> + Unpin + 'a, { while let Some(data_chunk) = collect_data_chunk_with_builder(&mut iter, builder) .instrument_await("backfill_snapshot_read") @@ -465,63 +600,95 @@ where debug_assert!(data_chunk.cardinality() > 0); let ops = vec![Op::Insert; data_chunk.capacity()]; let stream_chunk = StreamChunk::from_parts(ops, data_chunk); - yield Some(stream_chunk); + yield stream_chunk; } - - yield None; } /// Schema /// | vnode | pk | `backfill_finished` | -/// Persists the state per vnode. -/// 1. For each (`vnode`, `current_pos`), -/// Either insert if no old state, -/// Or update the state if have old state. +/// Persists the state per vnode based on `BackfillState`. +/// We track the current committed state via `committed_progress` +/// so we know whether we need to persist the state or not. +/// +/// The state is encoded as follows: +/// `NotStarted`: +/// - Not persist to store at all. +/// +/// `InProgress`: +/// - Format: | vnode | pk | false | +/// - If change in current pos: Persist. +/// - No change in current pos: Do not persist. +/// +/// Completed +/// - Format: | vnode | pk | true | +/// - If previous state is `InProgress` / `NotStarted`: Persist. +/// - If previous state is Completed: Do not persist. +/// TODO(kwannoel): we should check committed state to be all `finished` in the tests. +/// TODO(kwannoel): Instead of persisting state per vnode each time, +/// we can optimize by persisting state for a subset of vnodes which were updated. pub(crate) async fn persist_state_per_vnode( epoch: EpochPair, table: &mut StateTableInner, - is_finished: bool, - backfill_state: &BackfillState, - committed_progress: &mut HashMap>, - temporary_state: &mut [Datum], + backfill_state: &mut BackfillState, + #[cfg(debug_assertions)] state_len: usize, + vnodes: impl Iterator, ) -> StreamExecutorResult<()> { - // No progress -> No need to commit anything. - if backfill_state.has_no_progress() { - table.commit_no_data_expected(epoch); - } - - for (vnode, backfill_progress) in backfill_state.iter_backfill_progress() { - let current_pos = match backfill_progress { - BackfillProgressPerVnode::Completed | BackfillProgressPerVnode::NotStarted => { - continue; - } - BackfillProgressPerVnode::InProgress(current_pos) => current_pos, - }; - build_temporary_state_with_vnode(temporary_state, *vnode, is_finished, current_pos); - - let old_state = committed_progress.get(vnode); - - if let Some(old_state) = old_state { - // No progress for vnode, means no data - if old_state == current_pos.as_inner() { - table.commit_no_data_expected(epoch); - return Ok(()); - } else { - // There's some progress, update the state. 
- table.write_record(Record::Update { - old_row: &old_state[..], - new_row: &(*temporary_state), - }); - table.commit(epoch).await?; + let mut has_progress = false; + for vnode in vnodes { + if !backfill_state.need_commit(&vnode) { + continue; + } + let (encoded_prev_state, encoded_current_state) = + match backfill_state.get_commit_state(&vnode) { + Some((old_state, new_state)) => (old_state, new_state), + None => continue, + }; + if let Some(encoded_prev_state) = encoded_prev_state { + // There's some progress, update the state. + #[cfg(debug_assertions)] + { + let pk: &[Datum; 1] = &[Some(vnode.to_scalar().into())]; + // old_row only contains the value segment. + let old_row = table.get_row(pk).await?; + match old_row { + Some(old_row) => { + let inner = old_row.as_inner(); + // value segment (without vnode) should be used for comparison + assert_eq!(inner, &encoded_prev_state[1..]); + assert_ne!(inner, &encoded_current_state[1..]); + assert_eq!(old_row.len(), state_len - 1); + assert_eq!(encoded_current_state.len(), state_len); + } + None => { + panic!("row {:#?} not found", pk); + } + } } + table.write_record(Record::Update { + old_row: &encoded_prev_state[..], + new_row: &encoded_current_state[..], + }); + has_progress = true; } else { // No existing state, create a new entry. + #[cfg(debug_assertions)] + { + let pk: &[Datum; 1] = &[Some(vnode.to_scalar().into())]; + let row = table.get_row(pk).await?; + assert!(row.is_none(), "row {:#?}", row); + assert_eq!(encoded_current_state.len(), state_len); + } table.write_record(Record::Insert { - new_row: &(*temporary_state), + new_row: &encoded_current_state[..], }); - table.commit(epoch).await?; + has_progress = true; } - committed_progress.insert(*vnode, current_pos.as_inner().to_vec()); + backfill_state.mark_committed(vnode); + } + if has_progress { + table.commit(epoch).await?; + } else { + table.commit_no_data_expected(epoch); } Ok(()) } @@ -550,3 +717,26 @@ pub(crate) async fn persist_state( } Ok(()) } + +/// Creates a data chunk builder for snapshot read. +/// If the `rate_limit` is smaller than `chunk_size`, it will take precedence. +/// This is so we can partition snapshot read into smaller chunks than chunk size. +pub fn create_builder( + rate_limit: Option, + chunk_size: usize, + data_types: Vec, +) -> DataChunkBuilder { + if let Some(rate_limit) = rate_limit + && rate_limit < chunk_size + { + DataChunkBuilder::new( + data_types, + rate_limit, + ) + } else { + DataChunkBuilder::new( + data_types, + chunk_size, + ) + } +} diff --git a/src/stream/src/executor/hash_agg.rs b/src/stream/src/executor/hash_agg.rs index 1478534771738..1fdde9083e15a 100644 --- a/src/stream/src/executor/hash_agg.rs +++ b/src/stream/src/executor/hash_agg.rs @@ -49,7 +49,6 @@ use crate::common::StreamChunkBuilder; use crate::error::StreamResult; use crate::executor::aggregation::AggGroup as GenericAggGroup; use crate::executor::error::StreamExecutorError; -use crate::executor::monitor::StreamingMetrics; use crate::executor::{BoxedMessageStream, Executor, Message}; use crate::task::AtomicU64Ref; @@ -138,8 +137,6 @@ struct ExecutorInner { /// Should emit on window close according to watermark? 
emit_on_window_close: bool, - - metrics: Arc, } impl ExecutorInner { @@ -250,7 +247,6 @@ impl HashAggExecutor { chunk_size: args.extra.chunk_size, max_dirty_groups_heap_size: args.extra.max_dirty_groups_heap_size, emit_on_window_close: args.extra.emit_on_window_close, - metrics: args.metrics, }, }) } @@ -399,7 +395,6 @@ impl HashAggExecutor { visibilities, &mut this.distinct_dedup_tables, agg_group.group_key(), - this.actor_ctx.clone(), ) .await?; for ((call, storage), visibility) in (this.agg_calls.iter()) @@ -423,11 +418,13 @@ impl HashAggExecutor { let actor_id_str = this.actor_ctx.id.to_string(); let fragment_id_str = this.actor_ctx.fragment_id.to_string(); let table_id_str = this.intermediate_state_table.table_id().to_string(); - this.metrics + this.actor_ctx + .streaming_metrics .agg_dirty_groups_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .set(vars.dirty_groups.len() as i64); - this.metrics + this.actor_ctx + .streaming_metrics .agg_dirty_groups_heap_size .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .set(vars.dirty_groups.estimated_heap_size() as i64); @@ -526,8 +523,7 @@ impl HashAggExecutor { } // Flush distinct dedup state. - vars.distinct_dedup - .flush(&mut this.distinct_dedup_tables, this.actor_ctx.clone())?; + vars.distinct_dedup.flush(&mut this.distinct_dedup_tables)?; // Evict cache to target capacity. vars.agg_group_cache.evict(); @@ -537,23 +533,28 @@ impl HashAggExecutor { let actor_id_str = this.actor_ctx.id.to_string(); let fragment_id_str = this.actor_ctx.fragment_id.to_string(); let table_id_str = this.intermediate_state_table.table_id().to_string(); - this.metrics + this.actor_ctx + .streaming_metrics .agg_lookup_miss_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc_by(std::mem::take(&mut vars.stats.lookup_miss_count)); - this.metrics + this.actor_ctx + .streaming_metrics .agg_total_lookup_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc_by(std::mem::take(&mut vars.stats.total_lookup_count)); - this.metrics + this.actor_ctx + .streaming_metrics .agg_cached_entry_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .set(vars.agg_group_cache.len() as i64); - this.metrics + this.actor_ctx + .streaming_metrics .agg_chunk_lookup_miss_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc_by(std::mem::take(&mut vars.stats.chunk_lookup_miss_count)); - this.metrics + this.actor_ctx + .streaming_metrics .agg_chunk_total_lookup_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc_by(std::mem::take(&mut vars.stats.chunk_total_lookup_count)); @@ -591,7 +592,7 @@ impl HashAggExecutor { let window_col_idx = this.group_key_indices[window_col_idx_in_group_key]; let agg_group_cache_metrics_info = MetricsInfo::new( - this.metrics.clone(), + this.actor_ctx.streaming_metrics.clone(), this.intermediate_state_table.table_id(), this.actor_ctx.id, "agg intermediate state table", @@ -607,10 +608,9 @@ impl HashAggExecutor { dirty_groups: Default::default(), distinct_dedup: DistinctDeduplicater::new( &this.agg_calls, - &this.watermark_epoch, + this.watermark_epoch.clone(), &this.distinct_dedup_tables, - this.actor_ctx.id, - this.metrics.clone(), + this.actor_ctx.clone(), ), buffered_watermarks: vec![None; this.group_key_indices.len()], window_watermark: None, diff --git a/src/stream/src/executor/mod.rs b/src/stream/src/executor/mod.rs index 004b47559d8f7..04754d71807bb 100644 --- 
a/src/stream/src/executor/mod.rs +++ b/src/stream/src/executor/mod.rs @@ -106,6 +106,7 @@ mod utils; pub use actor::{Actor, ActorContext, ActorContextRef}; use anyhow::Context; +pub use backfill::arrangement_backfill::*; pub use backfill::cdc::{CdcBackfillExecutor, ExternalStorageTable}; pub use backfill::no_shuffle_backfill::*; pub use barrier_recv::BarrierRecvExecutor; @@ -844,6 +845,11 @@ pub async fn expect_first_barrier( let barrier = message .into_barrier() .expect("the first message must be a barrier"); + // TODO: Is this check correct? + assert!(matches!( + barrier.kind, + BarrierKind::Checkpoint | BarrierKind::Initial + )); Ok(barrier) } diff --git a/src/stream/src/executor/simple_agg.rs b/src/stream/src/executor/simple_agg.rs index 6623a05854e0b..0d33a7dc3074e 100644 --- a/src/stream/src/executor/simple_agg.rs +++ b/src/stream/src/executor/simple_agg.rs @@ -25,7 +25,6 @@ use super::agg_common::{AggExecutorArgs, SimpleAggExecutorExtraArgs}; use super::aggregation::{ agg_call_filter_res, iter_table_storage, AggStateStorage, AlwaysOutput, DistinctDeduplicater, }; -use super::monitor::StreamingMetrics; use super::*; use crate::common::table::state_table::StateTable; use crate::error::StreamResult; @@ -91,8 +90,6 @@ struct ExecutorInner { /// Extreme state cache size extreme_cache_size: usize, - - metrics: Arc, } impl ExecutorInner { @@ -151,7 +148,6 @@ impl SimpleAggExecutor { distinct_dedup_tables: args.distinct_dedup_tables, watermark_epoch: args.watermark_epoch, extreme_cache_size: args.extreme_cache_size, - metrics: args.metrics, }, }) } @@ -182,7 +178,6 @@ impl SimpleAggExecutor { call_visibilities, &mut this.distinct_dedup_tables, None, - this.actor_ctx.clone(), ) .await?; @@ -212,8 +207,7 @@ impl SimpleAggExecutor { ) -> StreamExecutorResult> { let chunk = if vars.state_changed || vars.agg_group.is_uninitialized() { // Flush distinct dedup state. - vars.distinct_dedup - .flush(&mut this.distinct_dedup_tables, this.actor_ctx.clone())?; + vars.distinct_dedup.flush(&mut this.distinct_dedup_tables)?; // Flush states into intermediate state table. 
let encoded_states = vars.agg_group.encode_states(&this.agg_funcs)?; @@ -266,10 +260,9 @@ impl SimpleAggExecutor { let mut distinct_dedup = DistinctDeduplicater::new( &this.agg_calls, - &this.watermark_epoch, + this.watermark_epoch.clone(), &this.distinct_dedup_tables, - this.actor_ctx.id, - this.metrics.clone(), + this.actor_ctx.clone(), ); distinct_dedup.dedup_caches_mut().for_each(|cache| { cache.update_epoch(barrier.epoch.curr); diff --git a/src/stream/src/executor/test_utils.rs b/src/stream/src/executor/test_utils.rs index 9e5f7ed036b19..9547744443f48 100644 --- a/src/stream/src/executor/test_utils.rs +++ b/src/stream/src/executor/test_utils.rs @@ -293,7 +293,6 @@ pub mod agg_executor { AggExecutorArgs, HashAggExecutorExtraArgs, SimpleAggExecutorExtraArgs, }; use crate::executor::aggregation::AggStateStorage; - use crate::executor::monitor::StreamingMetrics; use crate::executor::{ ActorContext, ActorContextRef, BoxedExecutor, Executor, ExecutorInfo, HashAggExecutor, PkIndices, SimpleAggExecutor, @@ -498,7 +497,6 @@ pub mod agg_executor { intermediate_state_table, distinct_dedup_tables: Default::default(), watermark_epoch: Arc::new(AtomicU64::new(0)), - metrics: Arc::new(StreamingMetrics::unused()), extra: HashAggExecutorExtraArgs { group_key_indices, @@ -569,7 +567,6 @@ pub mod agg_executor { intermediate_state_table, distinct_dedup_tables: Default::default(), watermark_epoch: Arc::new(AtomicU64::new(0)), - metrics: Arc::new(StreamingMetrics::unused()), extra: SimpleAggExecutorExtraArgs {}, }) .unwrap() diff --git a/src/stream/src/from_proto/hash_agg.rs b/src/stream/src/from_proto/hash_agg.rs index 7b4c70a592417..b79913da18269 100644 --- a/src/stream/src/from_proto/hash_agg.rs +++ b/src/stream/src/from_proto/hash_agg.rs @@ -110,7 +110,6 @@ impl ExecutorBuilder for HashAggExecutorBuilder { intermediate_state_table, distinct_dedup_tables, watermark_epoch: stream.get_watermark_epoch(), - metrics: params.executor_stats, extra: HashAggExecutorExtraArgs { group_key_indices, chunk_size: params.env.config().developer.chunk_size, diff --git a/src/stream/src/from_proto/simple_agg.rs b/src/stream/src/from_proto/simple_agg.rs index a61cf375ae50b..61a5937aa4092 100644 --- a/src/stream/src/from_proto/simple_agg.rs +++ b/src/stream/src/from_proto/simple_agg.rs @@ -71,7 +71,6 @@ impl ExecutorBuilder for SimpleAggExecutorBuilder { intermediate_state_table, distinct_dedup_tables, watermark_epoch: stream.get_watermark_epoch(), - metrics: params.executor_stats, extra: SimpleAggExecutorExtraArgs {}, })? 
.boxed()) diff --git a/src/stream/src/from_proto/stream_scan.rs b/src/stream/src/from_proto/stream_scan.rs index f6f35b33f601b..a5dc8abaf7bcb 100644 --- a/src/stream/src/from_proto/stream_scan.rs +++ b/src/stream/src/from_proto/stream_scan.rs @@ -16,15 +16,18 @@ use std::sync::Arc; use risingwave_common::catalog::{ColumnDesc, ColumnId, TableId, TableOption}; use risingwave_common::util::sort_util::OrderType; +use risingwave_common::util::value_encoding::column_aware_row_encoding::ColumnAwareSerde; +use risingwave_common::util::value_encoding::BasicSerde; use risingwave_pb::plan_common::StorageTableDesc; use risingwave_pb::stream_plan::{StreamScanNode, StreamScanType}; use risingwave_storage::table::batch_table::storage_table::StorageTable; use risingwave_storage::table::Distribution; use super::*; -use crate::common::table::state_table::StateTable; +use crate::common::table::state_table::{ReplicatedStateTable, StateTable}; use crate::executor::{ - BackfillExecutor, ChainExecutor, FlowControlExecutor, RearrangedChainExecutor, + ArrangementBackfillExecutor, BackfillExecutor, ChainExecutor, FlowControlExecutor, + RearrangedChainExecutor, }; pub struct StreamScanExecutorBuilder; @@ -58,6 +61,7 @@ impl ExecutorBuilder for StreamScanExecutorBuilder { StreamScanType::Rearrange => { RearrangedChainExecutor::new(params.info, snapshot, upstream, progress).boxed() } + StreamScanType::Backfill => { let table_desc: &StorageTableDesc = node.get_table_desc()?; let table_id = TableId { @@ -116,6 +120,16 @@ impl ExecutorBuilder for StreamScanExecutorBuilder { .collect_vec(); let prefix_hint_len = table_desc.get_read_prefix_len_hint() as usize; let versioned = table_desc.versioned; + + let state_table = if let Ok(table) = node.get_state_table() { + Some( + StateTable::from_table_catalog(table, state_store.clone(), vnodes.clone()) + .await, + ) + } else { + None + }; + // TODO: refactor it with from_table_catalog in the future. let upstream_table = StorageTable::new_partial( state_store.clone(), @@ -130,11 +144,6 @@ impl ExecutorBuilder for StreamScanExecutorBuilder { prefix_hint_len, versioned, ); - let state_table = if let Ok(table) = node.get_state_table() { - Some(StateTable::from_table_catalog(table, state_store, vnodes).await) - } else { - None - }; BackfillExecutor::new( params.info, @@ -149,6 +158,56 @@ impl ExecutorBuilder for StreamScanExecutorBuilder { ) .boxed() } + StreamScanType::ArrangementBackfill => { + let column_ids = node + .upstream_column_ids + .iter() + .map(ColumnId::from) + .collect_vec(); + + let vnodes = params.vnode_bitmap.map(Arc::new); + + let state_table = node.get_state_table().unwrap(); + let state_table = StateTable::from_table_catalog( + state_table, + state_store.clone(), + vnodes.clone(), + ) + .await; + + let upstream_table = node.get_arrangement_table().unwrap(); + let versioned = upstream_table.get_version().is_ok(); + + macro_rules! 
new_executor { + ($SD:ident) => {{ + let upstream_table = + ReplicatedStateTable::<_, $SD>::from_table_catalog_with_output_column_ids( + upstream_table, + state_store.clone(), + vnodes, + column_ids, + ) + .await; + ArrangementBackfillExecutor::<_, $SD>::new( + params.info, + upstream_table, + upstream, + state_table, + output_indices, + progress, + stream.streaming_metrics.clone(), + params.env.config().developer.chunk_size, + node.rate_limit.map(|x| x as _), + ) + .boxed() + }}; + } + if versioned { + new_executor!(ColumnAwareSerde) + } else { + new_executor!(BasicSerde) + } + } StreamScanType::Unspecified => unreachable!(), }; Ok(FlowControlExecutor::new( diff --git a/src/tests/compaction_test/src/delete_range_runner.rs b/src/tests/compaction_test/src/delete_range_runner.rs index 5982fe818403b..d2acd7c754c74 100644 --- a/src/tests/compaction_test/src/delete_range_runner.rs +++ b/src/tests/compaction_test/src/delete_range_runner.rs @@ -63,7 +63,6 @@ use risingwave_storage::store::{ use risingwave_storage::StateStore; use crate::CompactionTestOpts; - pub fn start_delete_range(opts: CompactionTestOpts) -> Pin + Send>> { // WARNING: don't change the function signature. Making it `async fn` will cause // slow compile in release mode. @@ -435,13 +434,10 @@ impl NormalState { .get( TableKey(Bytes::copy_from_slice(key)), ReadOptions { - prefix_hint: None, ignore_range_tombstone, - retention_seconds: None, table_id: self.table_id, - read_version_from_backup: false, - prefetch_options: Default::default(), cache_policy: CachePolicy::Fill(CachePriority::High), + ..Default::default() }, ) .await @@ -462,13 +458,12 @@ impl NormalState { Bound::Excluded(TableKey(Bytes::copy_from_slice(right))), ), ReadOptions { - prefix_hint: None, ignore_range_tombstone, - retention_seconds: None, table_id: self.table_id, read_version_from_backup: false, prefetch_options: PrefetchOptions::default(), cache_policy: CachePolicy::Fill(CachePriority::High), + ..Default::default() }, ) .await @@ -494,13 +489,12 @@ impl CheckState for NormalState { Bound::Excluded(Bytes::copy_from_slice(right)).map(TableKey), ), ReadOptions { - prefix_hint: None, ignore_range_tombstone: true, - retention_seconds: None, table_id: self.table_id, read_version_from_backup: false, prefetch_options: PrefetchOptions::default(), cache_policy: CachePolicy::Fill(CachePriority::High), + ..Default::default() }, ) .await diff --git a/src/tests/simulation/tests/integration_tests/recovery/background_ddl.rs b/src/tests/simulation/tests/integration_tests/recovery/background_ddl.rs index 3d03aeb4067d4..4e1ef135f839c 100644 --- a/src/tests/simulation/tests/integration_tests/recovery/background_ddl.rs +++ b/src/tests/simulation/tests/integration_tests/recovery/background_ddl.rs @@ -73,7 +73,7 @@ async fn cancel_stream_jobs(session: &mut Session) -> Result> { tracing::info!("cancelling streaming jobs"); let ids = ids.split('\n').collect::>().join(","); let result = session.run(&format!("cancel jobs {};", ids)).await?; - tracing::info!("cancelled streaming jobs, {:#?}", result); + tracing::info!("cancelled streaming jobs, {}", result); let ids = result .split('\n') .map(|s| s.parse::().unwrap()) @@ -195,7 +195,7 @@ async fn test_ddl_cancel() -> Result<()> { session.run(CREATE_TABLE).await?; session.run(SEED_TABLE_500).await?; session.flush().await?; - session.run(SET_RATE_LIMIT_2).await?; + session.run(SET_RATE_LIMIT_1).await?; session.run(SET_BACKGROUND_DDL).await?; for _ in 0..5 { @@ -369,3 +369,38 @@ async fn test_sink_create() -> Result<()> { Ok(()) } + 
+#[tokio::test] +async fn test_background_agg_mv_recovery() -> Result<()> { + init_logger(); + let mut cluster = Cluster::start(Configuration::for_background_ddl()).await?; + let mut session = cluster.start_session(); + + session.run("CREATE TABLE t1 (v1 int)").await?; + session + .run("INSERT INTO t1 SELECT generate_series FROM generate_series(1, 200);") + .await?; + session.flush().await?; + session.run(SET_RATE_LIMIT_1).await?; + session.run(SET_BACKGROUND_DDL).await?; + session + .run("CREATE MATERIALIZED VIEW mv1 as select v1, count(*) from t1 group by v1;") + .await?; + sleep(Duration::from_secs(2)).await; + + kill_cn_and_meta_and_wait_recover(&cluster).await; + + // Now just wait for it to complete. + session.run(WAIT).await?; + + let t_count = session.run("SELECT COUNT(v1) FROM t1").await?; + let mv1_count = session.run("SELECT COUNT(v1) FROM mv1").await?; + assert_eq!(t_count, mv1_count); + + // Make sure that if MV killed and restarted + // it will not be dropped. + session.run("DROP MATERIALIZED VIEW mv1;").await?; + session.run("DROP TABLE t1;").await?; + + Ok(()) +} diff --git a/src/tests/sqlsmith/scripts/extract_queries.sh b/src/tests/sqlsmith/scripts/extract_queries.sh index ed9d5c9ee1d3f..9abac600296aa 100755 --- a/src/tests/sqlsmith/scripts/extract_queries.sh +++ b/src/tests/sqlsmith/scripts/extract_queries.sh @@ -15,7 +15,7 @@ SHRUNK_OUTPUT_FILE="$2".shrunk echo "--- Extracting queries" cat "$LOG_FILE" | rg "\[EXECUTING .*\]" | sed 's/.*\[EXECUTING .*\]: //' | sed 's/$/;/' > "$OUTPUT_FILE" -echo "--- Extracted queries to $LOG_FILE" +echo "--- Extracted queries to $OUTPUT_FILE" echo "--- Shrinking queries" cargo run --bin sqlsmith-reducer -- --input-file "$OUTPUT_FILE" --output-file "$SHRUNK_OUTPUT_FILE"
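
The backfill changes above track each vnode's progress as NotStarted -> InProgress(pos) -> Completed(pos), and `mark_chunk_ref_by_vnode` uses that progress to decide which upstream rows to forward while the snapshot is still being read. The following self-contained sketch models both rules with a plain integer pk instead of `OwnedRow` and `cmp_datum_iter`; it is an illustration of the intended transitions, not code from the patch.

use std::cmp::Ordering;

#[derive(Clone, Debug, PartialEq, Eq)]
enum Progress {
    NotStarted,      // no snapshot rows read for this vnode yet
    InProgress(i64), // backfilled up to and including this pk
    Completed(i64),  // snapshot fully read; position kept for state persistence
}

// Mirrors `update_progress`: a finished vnode must never be advanced again.
fn advance(progress: &mut Progress, new_pos: i64) {
    match progress {
        Progress::NotStarted | Progress::InProgress(_) => *progress = Progress::InProgress(new_pos),
        Progress::Completed(_) => unreachable!("vnode already finished backfill"),
    }
}

// Mirrors the per-row decision in `mark_chunk_ref_by_vnode`: forward an upstream row
// only if its vnode has completed backfill, or is in progress and the row's pk is
// less than or equal to the current position.
fn should_forward(progress: &Progress, row_pk: i64) -> bool {
    match progress {
        Progress::Completed(_) => true,
        Progress::NotStarted => false,
        Progress::InProgress(current_pos) => {
            matches!(row_pk.cmp(current_pos), Ordering::Less | Ordering::Equal)
        }
    }
}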
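
`get_commit_state` in utils.rs encodes each vnode's progress as | vnode | pk ... | is_finished | row_count |, with the three metadata columns counted by `METADATA_STATE_LEN`. The helper below is a minimal sketch of that row layout using only calls that already appear in the patch (`to_scalar`, `as_inner`, `Datum`); the function name and signature are illustrative and not part of the change.

use risingwave_common::hash::VirtualNode;
use risingwave_common::row::{OwnedRow, Row};
use risingwave_common::types::Datum;

/// `vnode`, `is_finished`, `row_count` occupy one column each.
const METADATA_STATE_LEN: usize = 3;

// Hypothetical helper mirroring the encoding in `get_commit_state`.
fn encode_progress_row(vnode: VirtualNode, current_pos: &OwnedRow, is_finished: bool) -> Vec<Datum> {
    let mut state = vec![None; current_pos.len() + METADATA_STATE_LEN];
    state[0] = Some(vnode.to_scalar().into()); // vnode column
    state[1..current_pos.len() + 1].clone_from_slice(current_pos.as_inner()); // upstream pk
    state[current_pos.len() + 1] = Some(is_finished.into()); // backfill_finished flag
    state[current_pos.len() + 2] = Some(0i64.into()); // row_count, filled in later
    state
}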
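
`create_builder`, now public in utils.rs, caps the snapshot `DataChunkBuilder` at the rate limit whenever it is smaller than the configured chunk size, so throttling can take effect between smaller chunks. A usage sketch follows, with assumed values and an assumed `Option<usize>` rate-limit type; the import path and data types are illustrative.

use risingwave_common::types::DataType;
use risingwave_common::util::chunk_coalesce::DataChunkBuilder;

// Assumed path to the helper added in this patch.
use crate::executor::backfill::utils::create_builder;

fn snapshot_builder_for_backfill() -> DataChunkBuilder {
    let data_types = vec![DataType::Int64, DataType::Varchar];
    // rate_limit = 256 < chunk_size = 1024, so snapshot chunks are capped at 256 rows.
    create_builder(Some(256), 1024, data_types)
}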