diff --git a/Cargo.lock b/Cargo.lock index b40e128bc676f..ceaaee00b5bad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4648,7 +4648,7 @@ checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" [[package]] name = "local_stats_alloc" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "workspace-hack", ] @@ -6098,7 +6098,7 @@ dependencies = [ [[package]] name = "pgwire" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "auto_enums", @@ -7241,7 +7241,7 @@ dependencies = [ [[package]] name = "risedev" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "chrono", @@ -7270,7 +7270,7 @@ dependencies = [ [[package]] name = "risedev-config" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "clap", @@ -7283,7 +7283,7 @@ dependencies = [ [[package]] name = "risingwave-fields-derive" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "expect-test", "indoc", @@ -7295,7 +7295,7 @@ dependencies = [ [[package]] name = "risingwave_backup" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "async-trait", @@ -7317,7 +7317,7 @@ dependencies = [ [[package]] name = "risingwave_batch" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "assert_matches", @@ -7363,7 +7363,7 @@ dependencies = [ [[package]] name = "risingwave_bench" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "aws-config", @@ -7397,7 +7397,7 @@ dependencies = [ [[package]] name = "risingwave_cmd" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "clap", "madsim-tokio", @@ -7418,7 +7418,7 @@ dependencies = [ [[package]] name = "risingwave_cmd_all" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "clap", @@ -7450,7 +7450,7 @@ dependencies = [ [[package]] name = "risingwave_common" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "arc-swap", @@ -7550,7 +7550,7 @@ dependencies = [ [[package]] name = "risingwave_common_heap_profiling" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "chrono", @@ -7565,7 +7565,7 @@ dependencies = [ [[package]] name = "risingwave_common_proc_macro" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "bae", "proc-macro-error", @@ -7576,7 +7576,7 @@ dependencies = [ [[package]] name = "risingwave_common_service" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "futures", @@ -7597,7 +7597,7 @@ dependencies = [ [[package]] name = "risingwave_compaction_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "async-trait", @@ -7624,7 +7624,7 @@ dependencies = [ [[package]] name = "risingwave_compactor" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "await-tree", @@ -7646,7 +7646,7 @@ dependencies = [ [[package]] name = "risingwave_compute" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "async-trait", @@ -7689,7 +7689,7 @@ dependencies = [ [[package]] name = "risingwave_connector" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "apache-avro 0.16.0", @@ -7790,7 +7790,7 @@ dependencies = [ [[package]] name = "risingwave_ctl" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "bytes", @@ -7825,7 +7825,7 @@ dependencies = [ [[package]] name = "risingwave_e2e_extended_mode_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" 
dependencies = [ "anyhow", "chrono", @@ -7840,7 +7840,7 @@ dependencies = [ [[package]] name = "risingwave_error" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "bincode 1.3.3", "bytes", @@ -7855,7 +7855,7 @@ dependencies = [ [[package]] name = "risingwave_expr" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "arrow-array", @@ -7892,7 +7892,7 @@ dependencies = [ [[package]] name = "risingwave_expr_impl" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "aho-corasick", "anyhow", @@ -7939,7 +7939,7 @@ dependencies = [ [[package]] name = "risingwave_frontend" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "arc-swap", @@ -8009,7 +8009,7 @@ dependencies = [ [[package]] name = "risingwave_hummock_sdk" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "bytes", "easy-ext", @@ -8024,7 +8024,7 @@ dependencies = [ [[package]] name = "risingwave_hummock_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "bytes", @@ -8056,7 +8056,7 @@ dependencies = [ [[package]] name = "risingwave_hummock_trace" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "bincode 2.0.0-rc.3", @@ -8120,7 +8120,7 @@ dependencies = [ [[package]] name = "risingwave_mem_table_spill_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "bytes", @@ -8136,7 +8136,7 @@ dependencies = [ [[package]] name = "risingwave_meta" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "arc-swap", @@ -8204,7 +8204,7 @@ dependencies = [ [[package]] name = "risingwave_meta_model_migration" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-std", "sea-orm-migration", @@ -8213,7 +8213,7 @@ dependencies = [ [[package]] name = "risingwave_meta_model_v2" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "risingwave_pb", "sea-orm", @@ -8223,7 +8223,7 @@ dependencies = [ [[package]] name = "risingwave_meta_node" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "clap", @@ -8253,7 +8253,7 @@ dependencies = [ [[package]] name = "risingwave_meta_service" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "async-trait", @@ -8278,7 +8278,7 @@ dependencies = [ [[package]] name = "risingwave_object_store" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "async-trait", "await-tree", @@ -8311,7 +8311,7 @@ dependencies = [ [[package]] name = "risingwave_pb" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "enum-as-inner", "fs-err", @@ -8331,7 +8331,7 @@ dependencies = [ [[package]] name = "risingwave_planner_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "expect-test", @@ -8353,7 +8353,7 @@ dependencies = [ [[package]] name = "risingwave_regress_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "clap", @@ -8367,7 +8367,7 @@ dependencies = [ [[package]] name = "risingwave_rpc_client" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "async-trait", @@ -8397,7 +8397,7 @@ dependencies = [ [[package]] name = "risingwave_rt" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "await-tree", "console", @@ -8476,7 +8476,7 @@ dependencies = [ [[package]] name = "risingwave_source" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "assert_matches", @@ -8498,7 +8498,7 @@ 
dependencies = [ [[package]] name = "risingwave_sqlparser" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "itertools 0.12.0", "matches", @@ -8525,7 +8525,7 @@ dependencies = [ [[package]] name = "risingwave_sqlsmith" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "chrono", @@ -8552,7 +8552,7 @@ dependencies = [ [[package]] name = "risingwave_state_cleaning_test" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "clap", @@ -8572,7 +8572,7 @@ dependencies = [ [[package]] name = "risingwave_storage" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "arc-swap", @@ -8587,6 +8587,7 @@ dependencies = [ "dyn-clone", "either", "enum-as-inner", + "expect-test", "fail", "fiemap", "foyer", @@ -8638,7 +8639,7 @@ dependencies = [ [[package]] name = "risingwave_stream" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "anyhow", "assert_matches", @@ -8698,7 +8699,7 @@ dependencies = [ [[package]] name = "risingwave_test_runner" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "fail", "sync-point", @@ -8725,7 +8726,7 @@ dependencies = [ [[package]] name = "risingwave_variables" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "chrono", "workspace-hack", @@ -11523,11 +11524,11 @@ dependencies = [ [[package]] name = "with_options" -version = "1.3.0-alpha" +version = "1.5.0-alpha" [[package]] name = "workspace-config" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "log", "openssl-sys", @@ -11538,7 +11539,7 @@ dependencies = [ [[package]] name = "workspace-hack" -version = "1.3.0-alpha" +version = "1.5.0-alpha" dependencies = [ "ahash 0.8.3", "allocator-api2", diff --git a/Cargo.toml b/Cargo.toml index 543183b232263..b16f130797705 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,7 +62,7 @@ exclude = ["lints"] resolver = "2" [workspace.package] -version = "1.3.0-alpha" +version = "1.5.0-alpha" edition = "2021" homepage = "https://github.com/risingwavelabs/risingwave" keywords = ["sql", "database", "streaming"] diff --git a/backwards-compat-tests/scripts/utils.sh b/backwards-compat-tests/scripts/utils.sh index 07c06ecfdef38..dc7ea42a0c481 100644 --- a/backwards-compat-tests/scripts/utils.sh +++ b/backwards-compat-tests/scripts/utils.sh @@ -129,37 +129,47 @@ version_lt() { ################################### Entry Points -# Get $OLD_VERSION and $NEW_VERSION for Risingwave -get_rw_versions() { - # For backwards compat test we assume we are testing the latest version of RW (i.e. latest main commit) - # against the Nth latest release candidate, where N > 1. N can be larger, - # in case some old cluster did not upgrade. - local VERSION_OFFSET=4 - - # First we obtain a list of versions from git branch names. - # Then we normalize them to semver format (MAJOR.MINOR.PATCH). - echo "--- git branch origin output" - git branch -r | grep origin - - # Extract X.Y.Z tags - echo "--- VERSION BRANCHES" - local tags=$(git tag | grep -E "^v[0-9]+\.[0-9]+\.[0-9]+$" | tr -d 'v' | tr -d ' ') - echo "$tags" - - # Then we sort them in descending order. - echo "--- VERSIONS" - local sorted_versions=$(echo -e "$tags" | sort -t '.' -n) - echo "$sorted_versions" - - # Then we take the Nth latest version. - # We set $OLD_VERSION to this. - OLD_VERSION=$(echo -e "$sorted_versions" | tail -n $VERSION_OFFSET | head -1) +get_old_version() { + # For backwards compat test we assume we are testing the latest version of RW (i.e. 
latest main commit)
+  # against the Nth latest release candidate, where N > 1. N can be larger,
+  # in case some old cluster did not upgrade.
+  if [[ -z $VERSION_OFFSET ]]
+  then
+    local VERSION_OFFSET=1
+  fi
+
+  # First we obtain a list of versions from git branch names.
+  # Then we normalize them to semver format (MAJOR.MINOR.PATCH).
+  echo "--- git branch origin output"
+  git branch -r | grep origin
+
+  # Extract X.Y.Z tags
+  echo "--- VERSION BRANCHES"
+  local tags=$(git tag | grep -E "^v[0-9]+\.[0-9]+\.[0-9]+$" | tr -d 'v' | tr -d ' ')
+  echo "$tags"
+
+  # Then we sort them in descending order.
+  echo "--- VERSIONS"
+  local sorted_versions=$(echo -e "$tags" | sort -t '.' -n)
+  echo "$sorted_versions"
+
+  # Then we take the Nth latest version.
+  # We set $OLD_VERSION to this.
+  OLD_VERSION=$(echo -e "$sorted_versions" | tail -n $VERSION_OFFSET | head -1)
+}
+
+get_new_version() {
   # Next, for $NEW_VERSION we just scrape it from `workspace.package.version`.
   NEW_VERSION=$(cat Cargo.toml | grep "\[workspace\.package\]" -A 5 | sed -n 's/version = \"\([0-9]*\.[0-9]*\.[0-9]*\).*/\1/p' | tr -d ' ')
+}
+
+# Get $OLD_VERSION and $NEW_VERSION for Risingwave
+get_rw_versions() {
+  get_old_version
+  get_new_version

-  # Then we assert that `$OLD_VERSION` < `$NEW_VERSION`.
-  if version_lt "$OLD_VERSION" "$NEW_VERSION"
+  # Then we assert that `$OLD_VERSION` <= `$NEW_VERSION`.
+  if version_le "$OLD_VERSION" "$NEW_VERSION"
   then
     echo "OLD_VERSION: $OLD_VERSION"
     echo "NEW_VERSION: $NEW_VERSION"
diff --git a/ci/scripts/run-backfill-tests.sh b/ci/scripts/run-backfill-tests.sh
index b0010af68c640..dddf88e4b4cac 100755
--- a/ci/scripts/run-backfill-tests.sh
+++ b/ci/scripts/run-backfill-tests.sh
@@ -96,21 +96,20 @@ restart_cn() {
 test_snapshot_and_upstream_read() {
   echo "--- e2e, ci-backfill, test_snapshot_and_upstream_read"
   cargo make ci-start ci-backfill
-
-  run_sql_file "$PARENT_PATH"/sql/backfill/create_base_table.sql
+  run_sql_file "$PARENT_PATH"/sql/backfill/basic/create_base_table.sql

   # Provide snapshot
-  run_sql_file "$PARENT_PATH"/sql/backfill/insert.sql
+  run_sql_file "$PARENT_PATH"/sql/backfill/basic/insert.sql

   # Provide updates ...
-  run_sql_file "$PARENT_PATH"/sql/backfill/insert.sql &
+  run_sql_file "$PARENT_PATH"/sql/backfill/basic/insert.sql &

   # ... and concurrently create mv.
- run_sql_file "$PARENT_PATH"/sql/backfill/create_mv.sql & + run_sql_file "$PARENT_PATH"/sql/backfill/basic/create_mv.sql & wait - run_sql_file "$PARENT_PATH"/sql/backfill/select.sql \ + --data-directory hummock_001 \ + --config-path /risingwave.toml\" \ + --compute-opts=\" \ + --config-path /risingwave.toml \ + --listen-addr 0.0.0.0:5688 \ + --prometheus-listener-addr 0.0.0.0:1222 \ + --advertise-addr 0.0.0.0:5688 \ + --async-stack-trace verbose \ + --connector-rpc-endpoint 0.0.0.0:50051 \ + # --parallelism 4 \ + --role both \ + --meta-address http://0.0.0.0:5690\" \ + --frontend-opts=\" \ + --config-path /risingwave.toml \ + --listen-addr 0.0.0.0:4566 \ + --advertise-addr 0.0.0.0:4566 \ + --prometheus-listener-addr 0.0.0.0:2222 \ + --health-check-listener-addr 0.0.0.0:6786 \ + --meta-addr http://0.0.0.0:5690\" \ + --compactor-opts=\" \ + --listen-addr 0.0.0.0:6660 \ + --prometheus-listener-addr 0.0.0.0:1260 \ + --advertise-addr 0.0.0.0:6660 \ + --meta-address http://0.0.0.0:5690\"" + expose: + - "6660" + - "1260" + - "4566" + - "5688" + - "1222" + - "5690" + - "1250" + - "5691" + - "2222" + ports: + - "4566:4566" + - "5690:5690" + - "5691:5691" + - "1222:1222" + - "1250:1250" + - "1260:1260" + - "2222:2222" + depends_on: + - etcd-0 + volumes: + - "./risingwave.toml:/risingwave.toml" + environment: + RUST_BACKTRACE: "1" + # If ENABLE_TELEMETRY is not set, telemetry will start by default + ENABLE_TELEMETRY: ${ENABLE_TELEMETRY:-true} + container_name: risingwave-standalone + healthcheck: + test: + - CMD-SHELL + - bash -c 'printf \"GET / HTTP/1.1\n\n\" > /dev/tcp/127.0.0.1/6660; exit $$?;' + - bash -c 'printf \"GET / HTTP/1.1\n\n\" > /dev/tcp/127.0.0.1/5688; exit $$?;' + - bash -c 'printf \"GET / HTTP/1.1\n\n\" > /dev/tcp/127.0.0.1/4566; exit $$?;' + - bash -c 'printf \"GET / HTTP/1.1\n\n\" > /dev/tcp/127.0.0.1/5690; exit $$?;' + interval: 1s + timeout: 5s + restart: always + deploy: + resources: + limits: + memory: + reservations: + memory: + etcd-0: + extends: + file: docker-compose.yml + service: etcd-0 + grafana-0: + extends: + file: docker-compose.yml + service: grafana-0 + prometheus-0: + extends: + file: docker-compose.yml + service: prometheus-0 +volumes: + etcd-0: + external: false + grafana-0: + external: false + prometheus-0: + external: false diff --git a/integration_tests/big-query-sink/README.md b/integration_tests/big-query-sink/README.md index 1f06d3dfe1172..78c20a9866904 100644 --- a/integration_tests/big-query-sink/README.md +++ b/integration_tests/big-query-sink/README.md @@ -23,9 +23,9 @@ CREATE table '${project_id}'.'${dataset_id}'.'${table_id}'( 4. Execute the SQL queries in sequence: -- append-only/create_source.sql -- append-only/create_mv.sql -- append-only/create_sink.sql +- create_source.sql +- create_mv.sql +- create_sink.sql 1. We need to obtain the JSON file for Google Cloud service accounts, which can be configured here: https://console.cloud.google.com/iam-admin/serviceaccounts. 2. 
Because BigQuery has limited support for updates and deletes, we currently only support 'append only' diff --git a/integration_tests/big-query-sink/append-only-sql/create_mv.sql b/integration_tests/big-query-sink/create_mv.sql similarity index 100% rename from integration_tests/big-query-sink/append-only-sql/create_mv.sql rename to integration_tests/big-query-sink/create_mv.sql diff --git a/integration_tests/big-query-sink/append-only-sql/create_sink.sql b/integration_tests/big-query-sink/create_sink.sql similarity index 100% rename from integration_tests/big-query-sink/append-only-sql/create_sink.sql rename to integration_tests/big-query-sink/create_sink.sql diff --git a/integration_tests/big-query-sink/append-only-sql/create_source.sql b/integration_tests/big-query-sink/create_source.sql similarity index 100% rename from integration_tests/big-query-sink/append-only-sql/create_source.sql rename to integration_tests/big-query-sink/create_source.sql diff --git a/integration_tests/big-query-sink/prepare.sh b/integration_tests/big-query-sink/prepare.sh new file mode 100755 index 0000000000000..ca9cc3284939d --- /dev/null +++ b/integration_tests/big-query-sink/prepare.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -euo pipefail + +# set gcloud +docker compose exec gcloud-cli gcloud auth login --cred-file=/gcp-rwctest.json + +docker compose exec gcloud-cli gcloud config set project rwctest + +bq_prepare_file='bq_prepare.sql' +bq_prepare_content=$(cat $bq_prepare_file) + +docker compose exec gcloud-cli bq query --use_legacy_sql=false "$bq_prepare_content" + +sleep 10 diff --git a/integration_tests/big-query-sink/sink_check b/integration_tests/big-query-sink/sink_check deleted file mode 100644 index 14cfded736c5e..0000000000000 --- a/integration_tests/big-query-sink/sink_check +++ /dev/null @@ -1 +0,0 @@ -rwctest.bqtest.bq_sink diff --git a/integration_tests/big-query-sink/sink_check.py b/integration_tests/big-query-sink/sink_check.py new file mode 100644 index 0000000000000..7388c853c11cd --- /dev/null +++ b/integration_tests/big-query-sink/sink_check.py @@ -0,0 +1,25 @@ +import json +import subprocess +import sys + +relations = ['rwctest.bqtest.bq_sink'] + +failed_cases = [] +for rel in relations: + sql = f"SELECT COUNT(*) AS count FROM `{rel}`" + print(f"run sql: {sql} on Bigquery") + rows = subprocess.check_output( + ["docker", "compose", "exec", "gcloud-cli", "bq", "query", "--use_legacy_sql=false", "--format=json", sql], + ) + rows = int(json.loads(rows.decode("utf-8").strip())[0]['count']) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + + drop_sql = f"DROP TABLE IF EXISTS `{rel}`" + subprocess.run(["docker", "compose", "exec", "gcloud-cli", "bq", "query", "--use_legacy_sql=false", drop_sql], + check=True) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/cassandra-and-scylladb-sink/prepare.sh b/integration_tests/cassandra-and-scylladb-sink/prepare.sh new file mode 100755 index 0000000000000..690537d878208 --- /dev/null +++ b/integration_tests/cassandra-and-scylladb-sink/prepare.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -euo pipefail + +# wait for cassandra and scylladb to start up +sleep 60 + +# setup cassandra +docker compose exec cassandra cqlsh -f prepare_cassandra_and_scylladb.sql + +# setup scylladb +docker compose exec scylladb cqlsh -f prepare_cassandra_and_scylladb.sql diff --git a/integration_tests/cassandra-and-scylladb-sink/sink_check 
b/integration_tests/cassandra-and-scylladb-sink/sink_check deleted file mode 100644 index 49a88f8df2245..0000000000000 --- a/integration_tests/cassandra-and-scylladb-sink/sink_check +++ /dev/null @@ -1 +0,0 @@ -demo.demo_bhv_table diff --git a/integration_tests/cassandra-and-scylladb-sink/sink_check.py b/integration_tests/cassandra-and-scylladb-sink/sink_check.py new file mode 100644 index 0000000000000..2087e002d9f44 --- /dev/null +++ b/integration_tests/cassandra-and-scylladb-sink/sink_check.py @@ -0,0 +1,40 @@ +import subprocess +import sys +from time import sleep + +sleep(30) + +relations = ['demo.demo_bhv_table'] + +dbs = ['cassandra', 'scylladb'] +failed_cases = [] +for rel in relations: + sql = f'select count(*) from {rel};' + for db in dbs: + print(f"Running SQL: {sql} on {db}") + query_output_file_name = f"query_{db}_output.txt" + query_output_file = open(query_output_file_name, "wb+") + + subprocess.run(["docker", "compose", "exec", db, "cqlsh", "-e", sql], check=True, + stdout=query_output_file) + + # output file: + # + # count + # ------- + # 1000 + # + # (1 rows) + query_output_file.seek(0) + lines = query_output_file.readlines() + query_output_file.close() + assert len(lines) >= 6 + assert lines[1].decode('utf-8').strip().lower() == 'count' + rows = int(lines[3].decode('utf-8').strip()) + print(f"{rows} rows in {db}.{rel}") + if rows < 1: + failed_cases.append(db + "_" + rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/clickhouse-sink/README.md b/integration_tests/clickhouse-sink/README.md index a383f3fba5ee4..efcf995fb3df8 100644 --- a/integration_tests/clickhouse-sink/README.md +++ b/integration_tests/clickhouse-sink/README.md @@ -14,7 +14,7 @@ The cluster contains a RisingWave cluster and its necessary dependencies, a data 2. Create the ClickHouse table: ```sh -docker compose exec clickhouse-server bash /opt/clickhouse/clickhouse-sql/run-sql-file.sh create_clickhouse_table +./prepare.sh ``` 3. Execute the SQL queries in sequence: @@ -28,8 +28,7 @@ We only support `upsert` with clickhouse' `CollapsingMergeTree` and `VersionedCo 4. 
Execute a simple query: ```sh -docker compose exec clickhouse-server bash /opt/clickhouse/clickhouse-sql/run-sql-file.sh clickhouse_query - +docker compose exec clickhouse-server clickhouse-client ``` ```sql diff --git a/integration_tests/clickhouse-sink/clickhouse-sql/clickhouse_query.sql b/integration_tests/clickhouse-sink/clickhouse-sql/clickhouse_query.sql deleted file mode 100644 index a349770369552..0000000000000 --- a/integration_tests/clickhouse-sink/clickhouse-sql/clickhouse_query.sql +++ /dev/null @@ -1 +0,0 @@ -select user_id, count(*) from default.demo_test group by user_id limit 10 diff --git a/integration_tests/clickhouse-sink/clickhouse-sql/run-sql-file.sh b/integration_tests/clickhouse-sink/clickhouse-sql/run-sql-file.sh deleted file mode 100644 index a122d09dcd424..0000000000000 --- a/integration_tests/clickhouse-sink/clickhouse-sql/run-sql-file.sh +++ /dev/null @@ -1,3 +0,0 @@ -set -ex - -clickhouse-client < /opt/clickhouse/clickhouse-sql/$1.sql \ No newline at end of file diff --git a/integration_tests/clickhouse-sink/clickhouse-sql/create_clickhouse_table.sql b/integration_tests/clickhouse-sink/clickhouse_prepare.sql similarity index 100% rename from integration_tests/clickhouse-sink/clickhouse-sql/create_clickhouse_table.sql rename to integration_tests/clickhouse-sink/clickhouse_prepare.sql diff --git a/integration_tests/clickhouse-sink/docker-compose.yml b/integration_tests/clickhouse-sink/docker-compose.yml index 8129c7d618daf..76b0f7fe607f5 100644 --- a/integration_tests/clickhouse-sink/docker-compose.yml +++ b/integration_tests/clickhouse-sink/docker-compose.yml @@ -12,7 +12,7 @@ services: expose: - 9009 volumes: - - ./clickhouse-sql:/opt/clickhouse/clickhouse-sql + - ./clickhouse_prepare.sql:/clickhouse_prepare.sql risingwave-standalone: extends: file: ../../docker/docker-compose.yml @@ -33,10 +33,6 @@ services: extends: file: ../../docker/docker-compose.yml service: prometheus-0 - message_queue: - extends: - file: ../../docker/docker-compose.yml - service: message_queue volumes: risingwave-standalone: external: false diff --git a/integration_tests/clickhouse-sink/prepare.sh b/integration_tests/clickhouse-sink/prepare.sh new file mode 100755 index 0000000000000..cb8ec629b254e --- /dev/null +++ b/integration_tests/clickhouse-sink/prepare.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -euo pipefail + +# setup clickhouse +docker compose exec clickhouse-server bash -c "clickhouse-client < /clickhouse_prepare.sql" diff --git a/integration_tests/clickhouse-sink/sink_check.py b/integration_tests/clickhouse-sink/sink_check.py new file mode 100644 index 0000000000000..bb18e7e93ddd7 --- /dev/null +++ b/integration_tests/clickhouse-sink/sink_check.py @@ -0,0 +1,22 @@ +import subprocess +import sys +from time import sleep + +sleep(30) + +relations = ['default.demo_test'] + +failed_cases = [] +for rel in relations: + sql = f"SELECT COUNT(*) FROM {rel};" + print(f"Running SQL: {sql} ON ClickHouse") + command = f'clickhouse-client -q "{sql}"' + rows = subprocess.check_output(["docker", "compose", "exec", "clickhouse-server", "bash", "-c", command]) + rows = int(rows.decode('utf-8').strip()) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/cockroach-sink/README.md b/integration_tests/cockroach-sink/README.md index 5792c08021be8..c7ef841ea475b 100644 --- a/integration_tests/cockroach-sink/README.md +++ 
b/integration_tests/cockroach-sink/README.md @@ -5,9 +5,8 @@ This demo showcases how to sink RisingWave's data to an external CockroachDB. A During CI, the integration test will: 1. Run `docker compose up -d` and start the cluster. -2. After 20-30s, run `create_source.sql`. -3. After 10s, run `create_mv.sql`. -4. After another 10s, the tester will check if the ingestion is successful by creating a materialized view upon the source. It also checks if the MV created in the 3rd step has persisted the data. +2. After 20-30s, run `create_source.sql`, `create_mv.sql`, `create_sink.sql` +3. After another 30s, the tester will check if the ingestion is successful by `SELECT COUNT(*) FROM target_count;` in CockroachDB. To connect to the Postgres outside the container via psql: diff --git a/integration_tests/cockroach-sink/postgres_prepare.sql b/integration_tests/cockroach-sink/cockroach_prepare.sql similarity index 100% rename from integration_tests/cockroach-sink/postgres_prepare.sql rename to integration_tests/cockroach-sink/cockroach_prepare.sql diff --git a/integration_tests/cockroach-sink/create_mv.sql b/integration_tests/cockroach-sink/create_mv.sql index 29fdfa5cfdc4c..2cba41795922f 100644 --- a/integration_tests/cockroach-sink/create_mv.sql +++ b/integration_tests/cockroach-sink/create_mv.sql @@ -6,13 +6,3 @@ FROM user_behaviors GROUP BY target_id; - -CREATE SINK target_count_postgres_sink -FROM - target_count WITH ( - connector = 'jdbc', - jdbc.url = 'jdbc:postgresql://cockroachdb:26257/defaultdb?user=root', - table.name = 'target_count', - type = 'upsert', - primary_key = 'target_id' - ); diff --git a/integration_tests/cockroach-sink/create_sink.sql b/integration_tests/cockroach-sink/create_sink.sql new file mode 100644 index 0000000000000..87c767f3dc3bd --- /dev/null +++ b/integration_tests/cockroach-sink/create_sink.sql @@ -0,0 +1,20 @@ +CREATE SINK target_count_postgres_sink +FROM + target_count WITH ( + connector = 'jdbc', + jdbc.url = 'jdbc:postgresql://cockroachdb:26257/defaultdb?user=root', + table.name = 'target_count', + type = 'upsert', + primary_key = 'target_id' + ); + +-- sink data_type table to pg +CREATE SINK data_types_postgres_sink +FROM + data_types WITH ( + connector = 'jdbc', + jdbc.url = 'jdbc:postgresql://cockroachdb:26257/defaultdb?user=root', + table.name = 'data_types', + type='upsert', + primary_key = 'id' +); diff --git a/integration_tests/cockroach-sink/create_source.sql b/integration_tests/cockroach-sink/create_source.sql index b37504e75dcce..68308df89ce9b 100644 --- a/integration_tests/cockroach-sink/create_source.sql +++ b/integration_tests/cockroach-sink/create_source.sql @@ -1,4 +1,4 @@ -CREATE SOURCE user_behaviors ( +CREATE TABLE user_behaviors ( user_id VARCHAR, target_id VARCHAR, target_type VARCHAR, @@ -7,10 +7,11 @@ CREATE SOURCE user_behaviors ( parent_target_type VARCHAR, parent_target_id VARCHAR ) WITH ( - connector = 'kafka', - topic = 'user_behaviors', - properties.bootstrap.server = 'message_queue:29092', - scan.startup.mode = 'earliest' + connector = 'datagen', + fields.user_id.kind = 'sequence', + fields.user_id.start = 1, + fields.user_id.end = 100, + datagen.rows.per.second = '100' ) FORMAT PLAIN ENCODE JSON; CREATE TABLE data_types ( @@ -34,17 +35,6 @@ CREATE TABLE data_types ( array_column VARCHAR[] ); --- sink data_type table to pg -CREATE SINK data_types_postgres_sink -FROM - data_types WITH ( - connector = 'jdbc', - jdbc.url = 'jdbc:postgresql://cockroachdb:26257/defaultdb?user=root', - table.name = 'data_types', - type='upsert', - 
primary_key = 'id' -); - INSERT INTO data_types (id, varchar_column, text_column, integer_column, smallint_column, bigint_column, decimal_column, real_column, double_column, boolean_column, date_column, time_column, timestamp_column, timestamptz_column, interval_column, jsonb_column, bytea_column, array_column) VALUES (1, 'Varchar value 1', 'Text value 1', 123, 456, 789, 12.34, 56.78, 90.12, TRUE, '2023-05-22', '12:34:56', '2023-05-22 12:34:56', '2023-05-22 12:34:56+00:00', '1 day', '{"key": "value"}', E'\\xDEADBEEF', ARRAY['Value 1', 'Value 2']), @@ -52,4 +42,3 @@ VALUES (3, 'Varchar value 3', 'Text value 3', 345, 678, 901, 34.56, 78.90, 12.34, TRUE, '2023-05-24', '12:34:56', '2023-05-24 12:34:56', '2023-05-24 12:34:56+00:00', '3 days', '{"key": "value3"}', E'\\xCAFEBABE', ARRAY['Value 5', 'Value 6']), (4, 'Varchar value 4', 'Text value 4', 456, 789, 012, 45.67, 89.01, 23.45, FALSE, '2023-05-25', '23:45:01', '2023-05-25 23:45:01', '2023-05-25 23:45:01+00:00', '4 days', '{"key": "value4"}', E'\\xBABEC0DE', ARRAY['Value 7', 'Value 8']), (5, 'Varchar value 5', 'Text value 5', 567, 890, 123, 56.78, 90.12, 34.56, TRUE, '2023-05-26', '12:34:56', '2023-05-26 12:34:56', '2023-05-26 12:34:56+00:00', '5 days', '{"key": "value5"}', E'\\xDEADBABE', ARRAY['Value 9', 'Value 10']); - diff --git a/integration_tests/cockroach-sink/data_check b/integration_tests/cockroach-sink/data_check deleted file mode 100644 index 3835eb979b86e..0000000000000 --- a/integration_tests/cockroach-sink/data_check +++ /dev/null @@ -1 +0,0 @@ -user_behaviors,target_count \ No newline at end of file diff --git a/integration_tests/cockroach-sink/docker-compose.yml b/integration_tests/cockroach-sink/docker-compose.yml index cde3ef8742815..a205dca9e19cf 100644 --- a/integration_tests/cockroach-sink/docker-compose.yml +++ b/integration_tests/cockroach-sink/docker-compose.yml @@ -21,19 +21,6 @@ services: extends: file: ../../docker/docker-compose.yml service: prometheus-0 - message_queue: - extends: - file: ../../docker/docker-compose.yml - service: message_queue - datagen: - build: ../datagen - depends_on: [message_queue] - command: - - /bin/sh - - -c - - /datagen --mode clickstream --qps 2 kafka --brokers message_queue:29092 - restart: always - container_name: datagen cockroachdb: image: cockroachdb/cockroach:v23.1.11 command: start-single-node --insecure @@ -42,17 +29,11 @@ services: - "8080:8080" # CockroachDB Web UI port restart: always container_name: cockroachdb - prepare_postgres: - image: postgres - depends_on: - - cockroachdb - command: - - /bin/sh - - -c - - "psql postgresql://root@cockroachdb:26257/defaultdb < postgres_prepare.sql" + postgres: + image: postgres:latest + command: tail -f /dev/null volumes: - - "./postgres_prepare.sql:/postgres_prepare.sql" - container_name: prepare_postgres + - "./cockroach_prepare.sql:/cockroach_prepare.sql" restart: on-failure volumes: risingwave-standalone: diff --git a/integration_tests/cockroach-sink/prepare.sh b/integration_tests/cockroach-sink/prepare.sh new file mode 100755 index 0000000000000..bf9c0e8103d45 --- /dev/null +++ b/integration_tests/cockroach-sink/prepare.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -euo pipefail + +# setup cockroach +docker compose exec postgres bash -c "psql postgresql://root@cockroachdb:26257/defaultdb < cockroach_prepare.sql" diff --git a/integration_tests/cockroach-sink/query.sql b/integration_tests/cockroach-sink/query.sql deleted file mode 100644 index e09c66a255f10..0000000000000 --- a/integration_tests/cockroach-sink/query.sql +++ /dev/null @@ 
-1,6 +0,0 @@ -SELECT - * -FROM - target_count -LIMIT - 10; \ No newline at end of file diff --git a/integration_tests/cockroach-sink/sink_check.py b/integration_tests/cockroach-sink/sink_check.py new file mode 100644 index 0000000000000..41c6c34e7da39 --- /dev/null +++ b/integration_tests/cockroach-sink/sink_check.py @@ -0,0 +1,20 @@ +import subprocess +import sys + +relations = ['target_count', 'data_types'] + +failed_cases = [] +for rel in relations: + sql = f'SELECT COUNT(*) FROM {rel};' + print(f"Running SQL: {sql} ON cockroach") + command = f'psql -U root -h cockroachdb -p 26257 -d defaultdb --tuples-only -c "{sql}"' + rows = subprocess.check_output( + ["docker", "compose", "exec", "postgres", "bash", "-c", command]) + rows = int(rows.decode('utf-8').strip()) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/elasticsearch-sink/sink_check b/integration_tests/elasticsearch-sink/sink_check deleted file mode 100644 index 9daeafb9864cf..0000000000000 --- a/integration_tests/elasticsearch-sink/sink_check +++ /dev/null @@ -1 +0,0 @@ -test diff --git a/integration_tests/elasticsearch-sink/sink_check.py b/integration_tests/elasticsearch-sink/sink_check.py new file mode 100644 index 0000000000000..0e6ad8eda4da4 --- /dev/null +++ b/integration_tests/elasticsearch-sink/sink_check.py @@ -0,0 +1,22 @@ +import json +import subprocess +import sys + +relations = ['test'] + +failed_cases = [] +versions = ['7', '8'] +for rel in relations: + query = f'curl -XGET -u elastic:risingwave "http://localhost:9200/{rel}/_count" -H "Content-Type: application/json"' + for v in versions: + es = 'elasticsearch{}'.format(v) + print(f"Running Query: {query} on {es}") + counts = subprocess.check_output(["docker", "compose", "exec", es, "bash", "-c", query]) + counts = json.loads(counts)['count'] + print("{} counts in {}_{}".format(counts, es, rel)) + if counts < 1: + failed_cases.append(es + '_' + rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/iceberg-sink/prepare.sh b/integration_tests/iceberg-sink/prepare.sh new file mode 100755 index 0000000000000..f95aa2cbee250 --- /dev/null +++ b/integration_tests/iceberg-sink/prepare.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -euo pipefail + +# setup +docker compose exec spark bash /spark-script/run-sql-file.sh create-table diff --git a/integration_tests/iceberg-sink/sink_check.py b/integration_tests/iceberg-sink/sink_check.py new file mode 100644 index 0000000000000..74c45c6d08bb2 --- /dev/null +++ b/integration_tests/iceberg-sink/sink_check.py @@ -0,0 +1,23 @@ +import subprocess +from time import sleep + +sleep(60) + +query_sql = open("iceberg-query.sql").read() + +print("querying iceberg with presto sql: %s" % query_sql) + +query_output_file_name = "query_output.txt" + +query_output_file = open(query_output_file_name, "wb") + +subprocess.run( + ["docker", "compose", "exec", "presto", "presto-cli", "--server", "localhost:8080", "--execute", query_sql], + check=True, stdout=query_output_file) +query_output_file.close() + +output_content = open(query_output_file_name).read() + +print(output_content) + +assert len(output_content.strip()) > 0 diff --git a/integration_tests/kafka-cdc-sink/pg_check b/integration_tests/kafka-cdc-sink/pg_check deleted file mode 100644 index cd31705b2b725..0000000000000 --- a/integration_tests/kafka-cdc-sink/pg_check +++ 
/dev/null @@ -1 +0,0 @@ -counts,flinkcounts,types,flink_types diff --git a/integration_tests/kafka-cdc-sink/sink_check.py b/integration_tests/kafka-cdc-sink/sink_check.py new file mode 100644 index 0000000000000..b27472f0cacc1 --- /dev/null +++ b/integration_tests/kafka-cdc-sink/sink_check.py @@ -0,0 +1,24 @@ +import subprocess +import sys +from time import sleep + +# wait for one and a half minutes for the flink test pipeline +print("wait for one minute for ingestion") +sleep(60) + +relations = ['counts', 'flinkcounts', 'types', 'flink_types'] + +failed_cases = [] +for rel in relations: + sql = f'SELECT COUNT(*) FROM {rel};' + print(f"Running SQL: {sql} on PG") + command = f'psql -U $POSTGRES_USER $POSTGRES_DB --tuples-only -c "{sql}"' + rows = subprocess.check_output(["docker", "exec", "postgres", "bash", "-c", command]) + rows = int(rows.decode('utf8').strip()) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/mindsdb/query_sink.sh b/integration_tests/mindsdb/query_sink.sh deleted file mode 100644 index fdfd9bf910576..0000000000000 --- a/integration_tests/mindsdb/query_sink.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -x # Enable printing of each command - -# The model creation may take a long time. Our estimate is 30 seconds. But it can be longer in lower-perf machines. -sleep 30 - -QUERY='SELECT rental_price FROM home_rentals_model WHERE number_of_bathrooms = 2 AND sqft = 1000;' -psql -h localhost -p 55432 -U mindsdb -d mindsdb -c "$QUERY" \ No newline at end of file diff --git a/integration_tests/mindsdb/sink_check.py b/integration_tests/mindsdb/sink_check.py new file mode 100644 index 0000000000000..20ab0db9f9137 --- /dev/null +++ b/integration_tests/mindsdb/sink_check.py @@ -0,0 +1,7 @@ +import subprocess + +# The model creation may take a long time. Our estimate is 30 seconds. But it can be longer in lower-perf machines. 
+ +sql = "SELECT rental_price FROM home_rentals_model WHERE number_of_bathrooms = 2 AND sqft = 1000;" + +subprocess.run(["psql", "-h", "localhost", "-p", "55432", "-U", "mindsdb", "-d", "mindsdb", "-c", sql], check=True) diff --git a/integration_tests/mysql-sink/create_mv.sql b/integration_tests/mysql-sink/create_mv.sql index 72d6bf833c6e8..2cba41795922f 100644 --- a/integration_tests/mysql-sink/create_mv.sql +++ b/integration_tests/mysql-sink/create_mv.sql @@ -6,42 +6,3 @@ FROM user_behaviors GROUP BY target_id; - -CREATE SINK target_count_mysql_sink -FROM - target_count WITH ( - connector = 'jdbc', - jdbc.url = 'jdbc:mysql://mysql:3306/mydb?user=root&password=123456', - table.name = 'target_count', - type = 'upsert', - primary_key = 'target_id' - ); - --- ingest the table back to RW -CREATE TABLE rw_typed_data ( - id BIGINT PRIMARY KEY, - varchar_column VARCHAR, - text_column TEXT, - integer_column INTEGER, - smallint_column SMALLINT, - bigint_column BIGINT, - decimal_column DECIMAL, - real_column REAL, - double_column DOUBLE PRECISION, - boolean_column BOOLEAN, - date_column DATE, - time_column TIME, - timestamp_column TIMESTAMP, - timestamptz_column TIMESTAMPTZ, - jsonb_column JSONB, - bytea_column BYTEA -) WITH ( - connector = 'mysql-cdc', - hostname = 'mysql', - port = '3306', - username = 'root', - password = '123456', - database.name = 'mydb', - table.name = 'data_types', - server.id = '3' -); diff --git a/integration_tests/mysql-sink/create_sink.sql b/integration_tests/mysql-sink/create_sink.sql new file mode 100644 index 0000000000000..bfe9bf6c0b70e --- /dev/null +++ b/integration_tests/mysql-sink/create_sink.sql @@ -0,0 +1,19 @@ +CREATE SINK target_count_mysql_sink +FROM + target_count WITH ( + connector = 'jdbc', + jdbc.url = 'jdbc:mysql://mysql:3306/mydb?user=root&password=123456', + table.name = 'target_count', + type = 'upsert', + primary_key = 'target_id' + ); + +CREATE SINK data_types_mysql_sink +FROM + data_types WITH ( + connector = 'jdbc', + jdbc.url = 'jdbc:mysql://mysql:3306/mydb?user=root&password=123456', + table.name = 'data_types', + type = 'upsert', + primary_key = 'id' + ); diff --git a/integration_tests/mysql-sink/create_source.sql b/integration_tests/mysql-sink/create_source.sql index eb13c5a37cf83..f049457aa3121 100644 --- a/integration_tests/mysql-sink/create_source.sql +++ b/integration_tests/mysql-sink/create_source.sql @@ -7,10 +7,11 @@ CREATE SOURCE user_behaviors ( parent_target_type VARCHAR, parent_target_id VARCHAR ) WITH ( - connector = 'kafka', - topic = 'user_behaviors', - properties.bootstrap.server = 'message_queue:29092', - scan.startup.mode = 'earliest' + connector = 'datagen', + fields.user_id.kind = 'sequence', + fields.user_id.start = 1, + fields.user_id.end = 100, + datagen.rows.per.second = '100' ) FORMAT PLAIN ENCODE JSON; CREATE TABLE data_types ( @@ -32,16 +33,6 @@ CREATE TABLE data_types ( bytea_column BYTEA ); -CREATE SINK data_types_mysql_sink -FROM - data_types WITH ( - connector = 'jdbc', - jdbc.url = 'jdbc:mysql://mysql:3306/mydb?user=root&password=123456', - table.name = 'data_types', - type = 'upsert', - primary_key = 'id' - ); - INSERT INTO data_types (id, varchar_column, text_column, integer_column, smallint_column, bigint_column, decimal_column, real_column, double_column, boolean_column, date_column, time_column, timestamp_column, timestamptz_column, jsonb_column, bytea_column) VALUES (1, 'Varchar value 1', 'Text value 1', 123, 456, 789, 12.34, 56.78, 90.12, TRUE, '2023-05-22', '12:34:56', '2023-05-22 12:34:56', 
'2023-05-22T12:34:56Z', '{"key": "value"}', E'\\xDEADBEEF'), diff --git a/integration_tests/mysql-sink/data_check b/integration_tests/mysql-sink/data_check deleted file mode 100644 index 0f8b2d5166847..0000000000000 --- a/integration_tests/mysql-sink/data_check +++ /dev/null @@ -1 +0,0 @@ -user_behaviors,target_count,rw_typed_data \ No newline at end of file diff --git a/integration_tests/mysql-sink/docker-compose.yml b/integration_tests/mysql-sink/docker-compose.yml index 9b946514eb4aa..97d3d78ce4cb0 100644 --- a/integration_tests/mysql-sink/docker-compose.yml +++ b/integration_tests/mysql-sink/docker-compose.yml @@ -21,12 +21,8 @@ services: extends: file: ../../docker/docker-compose.yml service: prometheus-0 - message_queue: - extends: - file: ../../docker/docker-compose.yml - service: message_queue mysql: - image: mysql:8.0 + image: mysql:latest ports: - "3306:3306" environment: @@ -34,33 +30,14 @@ services: - MYSQL_USER=mysqluser - MYSQL_PASSWORD=mysqlpw - MYSQL_DATABASE=mydb + volumes: + - "./mysql_prepare.sql:/mysql_prepare.sql" healthcheck: test: [ "CMD-SHELL", "mysqladmin ping -h 127.0.0.1 -u root -p123456" ] interval: 5s timeout: 5s retries: 5 container_name: mysql - datagen: - build: ../datagen - depends_on: [message_queue] - command: - - /bin/sh - - -c - - /datagen --mode clickstream --qps 2 kafka --brokers message_queue:29092 - restart: always - container_name: datagen - prepare_mysql: - image: mysql:8.0 - depends_on: - - mysql - command: - - /bin/sh - - -c - - "mysql -p123456 -h mysql mydb < mysql_prepare.sql" - volumes: - - "./mysql_prepare.sql:/mysql_prepare.sql" - container_name: prepare_mysql - restart: on-failure volumes: risingwave-standalone: external: false diff --git a/integration_tests/mysql-sink/prepare.sh b/integration_tests/mysql-sink/prepare.sh new file mode 100755 index 0000000000000..9f2e93d1b40a5 --- /dev/null +++ b/integration_tests/mysql-sink/prepare.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -euo pipefail + +sleep 10 + +# setup mysql +docker compose exec mysql bash -c "mysql -p123456 -h mysql mydb < mysql_prepare.sql" diff --git a/integration_tests/mysql-sink/query.sql b/integration_tests/mysql-sink/query.sql deleted file mode 100644 index 6fbe4cc96813e..0000000000000 --- a/integration_tests/mysql-sink/query.sql +++ /dev/null @@ -1,13 +0,0 @@ -SELECT - * -FROM - target_count -LIMIT - 10; - -SELECT - * -FROM - data_types -LIMIT - 10; \ No newline at end of file diff --git a/integration_tests/mysql-sink/sink_check.py b/integration_tests/mysql-sink/sink_check.py new file mode 100644 index 0000000000000..b7cf590c38d09 --- /dev/null +++ b/integration_tests/mysql-sink/sink_check.py @@ -0,0 +1,23 @@ +import subprocess +import sys + +relations = ['target_count', 'data_types'] + +failed_cases = [] +for rel in relations: + sql = f'SELECT COUNT(*) FROM {rel};' + print(f"Running SQL: {sql} ON MYSQL") + command = f'mysql -p123456 mydb -e "{sql}"' + output = subprocess.check_output( + ["docker", "compose", "exec", "mysql", "bash", "-c", command]) + # output: + # COUNT(*) + # 0 + rows = int(output.decode('utf-8').split('\n')[1]) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/postgres-sink/README.md b/integration_tests/postgres-sink/README.md index 3066b8d37e3f9..c0d4d6956aff9 100644 --- a/integration_tests/postgres-sink/README.md +++ b/integration_tests/postgres-sink/README.md @@ -5,9 +5,8 @@ This demo showcases how to 
sink RisingWave's data to an external Postgres. The d Here's what this demo does: 1. `docker compose up -d`: Start the cluster. -2. After 20-30s: `create_source.sql`. -3. After 10s: `create_mv.sql`. -4. After another 10s, the tester will check if the source has ingested some data by creating a materialized view upon the source. It also checks if the MV created in the 3rd step has some data. +2. After 20-30s: `create_source.sql`, `create_mv.sql`, `create_sink.sql`. +3. After another 30s, the tester will check if the ingestion is successful by `SELECT COUNT(*) FROM target_count;` in Postgres. To connect to the Postgres on your local PC: diff --git a/integration_tests/postgres-sink/create_mv.sql b/integration_tests/postgres-sink/create_mv.sql index ef403f4b507c5..2cba41795922f 100644 --- a/integration_tests/postgres-sink/create_mv.sql +++ b/integration_tests/postgres-sink/create_mv.sql @@ -6,45 +6,3 @@ FROM user_behaviors GROUP BY target_id; - -CREATE SINK target_count_postgres_sink -FROM - target_count WITH ( - connector = 'jdbc', - jdbc.url = 'jdbc:postgresql://postgres:5432/mydb?user=myuser&password=123456', - table.name = 'target_count', - type = 'upsert', - primary_key = 'target_id' - ); - --- ingest back to RW -CREATE table rw_typed_data ( - id BIGINT PRIMARY KEY, - varchar_column VARCHAR, - text_column TEXT, - integer_column INTEGER, - smallint_column SMALLINT, - bigint_column BIGINT, - decimal_column DECIMAL, - real_column REAL, - double_column DOUBLE PRECISION, - boolean_column BOOLEAN, - date_column DATE, - time_column TIME, - timestamp_column TIMESTAMP, - timestamptz_column TIMESTAMPTZ, - interval_column INTERVAL, - jsonb_column JSONB, - bytea_column BYTEA, - array_column VARCHAR[] -) WITH ( - connector = 'postgres-cdc', - hostname = 'postgres', - port = '5432', - username = 'myuser', - password = '123456', - database.name = 'mydb', - schema.name = 'public', - table.name = 'data_types', - slot.name = 'data_types' -); diff --git a/integration_tests/postgres-sink/create_sink.sql b/integration_tests/postgres-sink/create_sink.sql new file mode 100644 index 0000000000000..e01ad2760cc2e --- /dev/null +++ b/integration_tests/postgres-sink/create_sink.sql @@ -0,0 +1,20 @@ +CREATE SINK target_count_postgres_sink +FROM + target_count WITH ( + connector = 'jdbc', + jdbc.url = 'jdbc:postgresql://postgres:5432/mydb?user=myuser&password=123456', + table.name = 'target_count', + type = 'upsert', + primary_key = 'target_id' + ); + +-- sink data_type table to pg +CREATE SINK data_types_postgres_sink +FROM + data_types WITH ( + connector = 'jdbc', + jdbc.url = 'jdbc:postgresql://postgres:5432/mydb?user=myuser&password=123456', + table.name = 'data_types', + type='upsert', + primary_key = 'id' +); diff --git a/integration_tests/postgres-sink/create_source.sql b/integration_tests/postgres-sink/create_source.sql index 7303e350f32df..6840f8cb379c1 100644 --- a/integration_tests/postgres-sink/create_source.sql +++ b/integration_tests/postgres-sink/create_source.sql @@ -7,10 +7,11 @@ CREATE SOURCE user_behaviors ( parent_target_type VARCHAR, parent_target_id VARCHAR ) WITH ( - connector = 'kafka', - topic = 'user_behaviors', - properties.bootstrap.server = 'message_queue:29092', - scan.startup.mode = 'earliest' + connector = 'datagen', + fields.user_id.kind = 'sequence', + fields.user_id.start = 1, + fields.user_id.end = 100, + datagen.rows.per.second = '100' ) FORMAT PLAIN ENCODE JSON; CREATE TABLE data_types ( @@ -34,17 +35,6 @@ CREATE TABLE data_types ( array_column VARCHAR[] ); --- sink data_type 
table to pg -CREATE SINK data_types_postgres_sink -FROM - data_types WITH ( - connector = 'jdbc', - jdbc.url = 'jdbc:postgresql://postgres:5432/mydb?user=myuser&password=123456', - table.name = 'data_types', - type='upsert', - primary_key = 'id' -); - INSERT INTO data_types (id, varchar_column, text_column, integer_column, smallint_column, bigint_column, decimal_column, real_column, double_column, boolean_column, date_column, time_column, timestamp_column, timestamptz_column, interval_column, jsonb_column, bytea_column, array_column) VALUES (1, 'Varchar value 1', 'Text value 1', 123, 456, 789, 12.34, 56.78, 90.12, TRUE, '2023-05-22', '12:34:56', '2023-05-22 12:34:56', '2023-05-22 12:34:56+00:00', '1 day', '{"key": "value"}', E'\\xDEADBEEF', ARRAY['Value 1', 'Value 2']), @@ -52,4 +42,3 @@ VALUES (3, 'Varchar value 3', 'Text value 3', 345, 678, 901, 34.56, 78.90, 12.34, TRUE, '2023-05-24', '12:34:56', '2023-05-24 12:34:56', '2023-05-24 12:34:56+00:00', '3 days', '{"key": "value3"}', E'\\xCAFEBABE', ARRAY['Value 5', 'Value 6']), (4, 'Varchar value 4', 'Text value 4', 456, 789, 012, 45.67, 89.01, 23.45, FALSE, '2023-05-25', '23:45:01', '2023-05-25 23:45:01', '2023-05-25 23:45:01+00:00', '4 days', '{"key": "value4"}', E'\\xBABEC0DE', ARRAY['Value 7', 'Value 8']), (5, 'Varchar value 5', 'Text value 5', 567, 890, 123, 56.78, 90.12, 34.56, TRUE, '2023-05-26', '12:34:56', '2023-05-26 12:34:56', '2023-05-26 12:34:56+00:00', '5 days', '{"key": "value5"}', E'\\xDEADBABE', ARRAY['Value 9', 'Value 10']); - diff --git a/integration_tests/postgres-sink/data_check b/integration_tests/postgres-sink/data_check deleted file mode 100644 index 0f8b2d5166847..0000000000000 --- a/integration_tests/postgres-sink/data_check +++ /dev/null @@ -1 +0,0 @@ -user_behaviors,target_count,rw_typed_data \ No newline at end of file diff --git a/integration_tests/postgres-sink/docker-compose.yml b/integration_tests/postgres-sink/docker-compose.yml index a5b8465d7c915..e443965c2e5be 100644 --- a/integration_tests/postgres-sink/docker-compose.yml +++ b/integration_tests/postgres-sink/docker-compose.yml @@ -21,23 +21,10 @@ services: extends: file: ../../docker/docker-compose.yml service: prometheus-0 - message_queue: - extends: - file: ../../docker/docker-compose.yml - service: message_queue - datagen: - build: ../datagen - depends_on: [message_queue] - command: - - /bin/sh - - -c - - /datagen --mode clickstream --qps 2 kafka --brokers message_queue:29092 - restart: always - container_name: datagen # Use this command to connect to the DB from outside the container: # docker exec postgres psql --username=myuser --dbname=mydb postgres: - image: postgres + image: postgres:latest environment: - POSTGRES_USER=myuser - POSTGRES_PASSWORD=123456 @@ -50,20 +37,10 @@ services: timeout: 5s retries: 5 command: [ "postgres", "-c", "wal_level=logical" ] - restart: always - container_name: postgres - prepare_postgres: - image: postgres - depends_on: - - postgres - command: - - /bin/sh - - -c - - "psql postgresql://myuser:123456@postgres:5432/mydb < postgres_prepare.sql" volumes: - "./postgres_prepare.sql:/postgres_prepare.sql" - container_name: prepare_postgres - restart: on-failure + restart: always + container_name: postgres volumes: risingwave-standalone: external: false diff --git a/integration_tests/postgres-sink/prepare.sh b/integration_tests/postgres-sink/prepare.sh new file mode 100755 index 0000000000000..ab1f2ddb465fc --- /dev/null +++ b/integration_tests/postgres-sink/prepare.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -euo pipefail + +# 
setup postgres +docker compose exec postgres bash -c "psql postgresql://myuser:123456@postgres:5432/mydb < postgres_prepare.sql" diff --git a/integration_tests/postgres-sink/query.sql b/integration_tests/postgres-sink/query.sql deleted file mode 100644 index e09c66a255f10..0000000000000 --- a/integration_tests/postgres-sink/query.sql +++ /dev/null @@ -1,6 +0,0 @@ -SELECT - * -FROM - target_count -LIMIT - 10; \ No newline at end of file diff --git a/integration_tests/postgres-sink/sink_check.py b/integration_tests/postgres-sink/sink_check.py new file mode 100644 index 0000000000000..606b78424a262 --- /dev/null +++ b/integration_tests/postgres-sink/sink_check.py @@ -0,0 +1,21 @@ +import sys +import subprocess + + +relations = ['target_count', 'data_types'] + +failed_cases = [] +for rel in relations: + sql = f'SELECT COUNT(*) FROM {rel};' + print(f"Running SQL: {sql} ON PG") + command = f'psql -U $POSTGRES_USER $POSTGRES_DB --tuples-only -c "{sql}"' + rows = subprocess.check_output( + ["docker", "compose", "exec", "postgres", "bash", "-c", command]) + rows = int(rows.decode('utf-8').strip()) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/redis-sink/sink_check b/integration_tests/redis-sink/sink_check deleted file mode 100644 index 80a05155b821f..0000000000000 --- a/integration_tests/redis-sink/sink_check +++ /dev/null @@ -1 +0,0 @@ -user_id,UserID,types_id,TYPESID diff --git a/integration_tests/redis-sink/sink_check.py b/integration_tests/redis-sink/sink_check.py new file mode 100644 index 0000000000000..24debe867a9bf --- /dev/null +++ b/integration_tests/redis-sink/sink_check.py @@ -0,0 +1,22 @@ +import sys +import subprocess + +relations = ['user_id', 'UserID', 'types_id', 'TYPESID'] + +failed_cases = [] +for rel in relations: + query = f"*{rel}*" + print(f"Running query: scan {query} on Redis") + output = subprocess.Popen(["docker", "compose", "exec", "redis", "redis-cli", "--scan", "--pattern", query], + stdout=subprocess.PIPE) + rows = subprocess.check_output(["wc", "-l"], stdin=output.stdout) + output.stdout.close() + output.wait() + rows = int(rows.decode('utf8').strip()) + print(f"{rows} keys in '*{rel}*'") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/integration_tests/scripts/check_data.py b/integration_tests/scripts/check_data.py index 0575615df8ef3..9c449d5bff43b 100644 --- a/integration_tests/scripts/check_data.py +++ b/integration_tests/scripts/check_data.py @@ -1,6 +1,7 @@ #!/usr/bin/python3 -# Every demo directory contains a 'data_check' file that lists the relations (either source or mv) +# Every sink demo directory contains a 'sink_check.py' file that used to check test, +# and other demo directory contains a 'data_check' file that lists the relations (either source or mv) # that are expected to have >=1 rows. This script runs the checks by creating a materialized view over it, # and verify the rows count in the view. 
@@ -52,35 +53,67 @@ def run_psql(sql):
                                     "-d", "dev", "-U", "root", "--tuples-only", "-c", sql])


+def data_check(data_check_file: str):
+    with open(data_check_file) as f:
+        relations = f.read().strip().split(",")
+        for rel in relations:
+            create_mv(rel)
+        time.sleep(20)
+        failed_cases = []
+        for rel in relations:
+            if not check_mv(rel):
+                failed_cases.append(rel)
+        if len(failed_cases) != 0:
+            raise Exception("Data check failed for case {}".format(failed_cases))
+
+
+def sink_check(demo_dir: str, sink_check_file: str):
+    print("sink created. Wait for half min time for ingestion")
+
+    # wait for half min ingestion
+    time.sleep(30)
+    subprocess.run(["python3", sink_check_file], cwd=demo_dir, check=True)
+
+
+def cdc_check(cdc_check_file: str, upstream: str):
+    with open(cdc_check_file) as f:
+        print("Check cdc table with upstream {}".format(upstream))
+        relations = f.read().strip().split(",")
+        for rel in relations:
+            check_cdc_table(rel)
+
+
+def test_check(demo: str, upstream: str, need_data_check=True, need_sink_check=False):
+    file_dir = dirname(abspath(__file__))
+    project_dir = dirname(file_dir)
+    demo_dir = os.path.join(project_dir, demo)
+
+    data_check_file = os.path.join(demo_dir, 'data_check')
+    if need_data_check or os.path.exists(data_check_file):
+        data_check(data_check_file)
+    else:
+        print(f"skip data check for {demo}")
+
+    sink_check_file = os.path.join(demo_dir, 'sink_check.py')
+    if need_sink_check or os.path.exists(sink_check_file):
+        sink_check(demo_dir, sink_check_file)
+    else:
+        print(f"skip sink check for {demo}")
+
+    cdc_check_file = os.path.join(demo_dir, 'cdc_check')
+    if os.path.exists(cdc_check_file):
+        cdc_check(cdc_check_file, upstream)
+    else:
+        print(f"skip cdc check for {demo}")
+
+
 demo = sys.argv[1]
 upstream = sys.argv[2]  # mysql, postgres, etc. see scripts/integration_tests.sh
-if demo in ['docker', 'iceberg-sink','clickhouse-sink', 'iceberg-cdc', 'kafka-cdc-sink', 'cassandra-and-scylladb-sink', 'elasticsearch-sink', 'redis-sink', 'big-query-sink']:
+if demo in ['docker', 'iceberg-cdc']:
     print('Skip for running test for `%s`' % demo)
     sys.exit(0)

-file_dir = dirname(abspath(__file__))
-project_dir = dirname(file_dir)
-demo_dir = os.path.join(project_dir, demo)
-data_check_file = os.path.join(demo_dir, 'data_check')
-with open(data_check_file) as f:
-    relations = f.read().strip().split(",")
-    for rel in relations:
-        create_mv(rel)
-    time.sleep(20)
-    failed_cases = []
-    for rel in relations:
-        if not check_mv(rel):
-            failed_cases.append(rel)
-    if len(failed_cases) != 0:
-        raise Exception("Data check failed for case {}".format(failed_cases))
-
-cdc_check_file = os.path.join(demo_dir, 'cdc_check')
-if not os.path.exists(cdc_check_file):
-    print("Skip cdc check for {}".format(demo))
-    sys.exit(0)
-
-with open(cdc_check_file) as f:
-    print("Check cdc table with upstream {}".format(upstream))
-    relations = f.read().strip().split(",")
-    for rel in relations:
-        check_cdc_table(rel)
+if 'sink' in demo:
+    test_check(demo, upstream, need_data_check=False, need_sink_check=True)
+else:
+    test_check(demo, upstream, need_data_check=True, need_sink_check=False)
diff --git a/integration_tests/scripts/run_demos.py b/integration_tests/scripts/run_demos.py
index c43c4a4cc1556..87967daa50b2c 100644
--- a/integration_tests/scripts/run_demos.py
+++ b/integration_tests/scripts/run_demos.py
@@ -6,7 +6,6 @@
 import subprocess
 from time import sleep
 import argparse
-import json


 def run_sql_file(f: str, dir: str):
@@ -20,16 +19,7 @@ def run_sql_file(f: str, dir: str):
         sys.exit(1)


-def run_bash_file(f: str, dir: str):
-    print("Running Bash file: {}".format(f))
-    # ON_ERROR_STOP=1 will let psql return error code when the query fails.
-    # https://stackoverflow.com/questions/37072245/check-return-status-of-psql-command-in-unix-shell-scripting
-    proc = subprocess.run(["bash", f], check=True, cwd=dir)
-    if proc.returncode != 0:
-        sys.exit(1)
-
-
-def run_demo(demo: str, format: str, wait_time = 40):
+def run_demo(demo: str, format: str, wait_time=40):
     file_dir = dirname(abspath(__file__))
     project_dir = dirname(file_dir)
     demo_dir = os.path.join(project_dir, demo)
@@ -38,7 +28,11 @@ def run_demo(demo: str, format: str, wait_time = 40):
     subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True)
     sleep(wait_time)

-    sql_files = ['create_source.sql', 'create_mv.sql', 'query.sql']
+    prepare_file = 'prepare.sh'
+    if os.path.exists(os.path.join(demo_dir, prepare_file)):
+        subprocess.run(["bash", prepare_file], cwd=demo_dir, check=True)
+
+    sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql', 'query.sql']
     for fname in sql_files:
         if format == 'protobuf':
             sql_file = os.path.join(demo_dir, "pb", fname)
@@ -53,50 +47,7 @@ def run_demo(demo: str, format: str, wait_time = 40):
             continue
         run_sql_file(sql_file, demo_dir)
         sleep(10)
-    # Run query_sink.sh if it exists.
- query_sink_file = os.path.join(demo_dir, 'query_sink.sh') - if os.path.isfile(query_sink_file): - run_bash_file(query_sink_file, demo_dir) - - -def run_kafka_cdc_demo(): - demo = "kafka-cdc-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: kafka-cdc-sink") - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - sleep(40) - - subprocess.run(["bash","./prepare.sh"], cwd=demo_dir, check=True) - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - - print("sink created. Wait for 2 min time for ingestion") - - # wait for two minutes ingestion - sleep(120) - - pg_check_file = os.path.join(demo_dir, 'pg_check') - with open(pg_check_file) as f: - relations = f.read().strip().split(",") - failed_cases = [] - for rel in relations: - sql = 'SELECT COUNT(*) FROM {}'.format(rel) - print("Running SQL: {} on PG".format(sql)) - command = 'psql -U $POSTGRES_USER $POSTGRES_DB --tuples-only -c "{}"'.format(sql) - rows = subprocess.check_output(["docker", "exec", "postgres", "bash", "-c", command]) - rows = int(rows.decode('utf8').strip()) - print("{} rows in {}".format(rows, rel)) - if rows < 1: - failed_cases.append(rel) - if len(failed_cases) != 0: - raise Exception("Data check failed for case {}".format(failed_cases)) def iceberg_cdc_demo(): demo = "iceberg-cdc" @@ -106,263 +57,6 @@ def iceberg_cdc_demo(): print("Running demo: iceberg-cdc") subprocess.run(["bash","./run_test.sh"], cwd=demo_dir, check=True) -def run_iceberg_demo(): - demo = "iceberg-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: iceberg-sink") - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - sleep(40) - - subprocess.run(["docker", "compose", "exec", "spark", "bash", "/spark-script/run-sql-file.sh", "create-table"], - cwd=demo_dir, check=True) - - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - sleep(10) - - print("sink created. 
Wait for 2 min time for ingestion") - - # wait for two minutes ingestion - sleep(120) - - query_sql = open(os.path.join(demo_dir, "iceberg-query.sql")).read() - - print("querying iceberg with presto sql: %s" % query_sql) - - query_output_file_name = "query_outout.txt" - - query_output_file = open(query_output_file_name, "wb") - - subprocess.run(["docker", "compose", "exec", "presto", "presto-cli", "--server", "localhost:8080", "--execute", query_sql], - cwd=demo_dir, check=True, stdout=query_output_file) - query_output_file.close() - - output_content = open(query_output_file_name).read() - - print(output_content) - - assert len(output_content.strip()) > 0 - -def run_clickhouse_demo(): - demo = "clickhouse-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: clickhouse-sink") - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - sleep(40) - - - subprocess.run(["docker", "compose", "exec", "clickhouse-server", "bash", "/opt/clickhouse/clickhouse-sql/run-sql-file.sh", "create_clickhouse_table"], - cwd=demo_dir, check=True) - - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - sleep(10) - - print("sink created. Wait for 2 min time for ingestion") - - # wait for two minutes ingestion - sleep(120) - - query_output_file_name = "query_outout.txt" - - query_output_file = open(query_output_file_name, "wb") - - subprocess.run(["docker", "compose", "exec", "clickhouse-server", "bash", "/opt/clickhouse/clickhouse-sql/run-sql-file.sh", "clickhouse_query"], - cwd=demo_dir, check=True, stdout=query_output_file) - query_output_file.close() - - output_content = open(query_output_file_name).read() - - print(output_content) - - assert len(output_content.strip()) > 0 - -def run_cassandra_and_scylladb_sink_demo(): - demo = "cassandra-and-scylladb-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: {}".format(demo)) - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - print("wait two min for cassandra and scylladb to start up") - sleep(120) - - dbs = ['cassandra', 'scylladb'] - for db in dbs: - subprocess.run(["docker", "compose", "exec", db, "cqlsh", "-f", "prepare_cassandra_and_scylladb.sql"], cwd=demo_dir, check=True) - - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - - print("sink created. 
Wait for 1 min time for ingestion") - - # wait for one minutes ingestion - sleep(60) - - sink_check_file = os.path.join(demo_dir, 'sink_check') - with open(sink_check_file) as f: - relations = f.read().strip().split(",") - failed_cases = [] - for rel in relations: - sql = 'select count(*) from {};'.format(rel) - for db in dbs: - print("Running SQL: {} on {}".format(sql, db)) - query_output_file_name = os.path.join(demo_dir, "query_{}_outout.txt".format(db)) - query_output_file = open(query_output_file_name, "wb+") - - command = "docker compose exec scylladb cqlsh -e" - subprocess.run(["docker", "compose", "exec", db, "cqlsh", "-e", sql], cwd=demo_dir, check=True, stdout=query_output_file) - - # output file: - # - # count - # ------- - # 1000 - # - # (1 rows) - query_output_file.seek(0) - lines = query_output_file.readlines() - query_output_file.close() - assert len(lines) >= 6 - assert lines[1].decode('utf-8').strip().lower() == 'count' - rows = int(lines[3].decode('utf-8').strip()) - print("{} rows in {}.{}".format(rows, db, rel)) - if rows < 1: - failed_cases.append(db + "_" + rel) - if len(failed_cases) != 0: - raise Exception("Data check failed for case {}".format(failed_cases)) - -def run_elasticsearch_sink_demo(): - demo = "elasticsearch-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: {}".format(demo)) - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - sleep(60) - - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - - print("sink created. 
Wait for half min time for ingestion") - - # wait for half min ingestion - sleep(30) - - versions = ['7', '8'] - sink_check_file = os.path.join(demo_dir, 'sink_check') - with open(sink_check_file) as f: - relations = f.read().strip().split(",") - failed_cases = [] - for rel in relations: - query = 'curl -XGET -u elastic:risingwave "http://localhost:9200/{}/_count" -H "Content-Type: application/json"'.format(rel) - for v in versions: - es = 'elasticsearch{}'.format(v) - print("Running Query: {} on {}".format(query, es)) - counts = subprocess.check_output(["docker", "compose", "exec", es, "bash", "-c", query], cwd=demo_dir) - counts = json.loads(counts)['count'] - print("{} counts in {}_{}".format(counts, es, rel)) - if counts < 1: - failed_cases.append(es + '_' + rel) - if len(failed_cases) != 0: - raise Exception("Data check failed for case {}".format(failed_cases)) - -def run_redis_demo(): - demo = "redis-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: {}".format(demo)) - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - sleep(40) - - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - - sleep(40) - sink_check_file = os.path.join(demo_dir, 'sink_check') - with open(sink_check_file) as f: - relations = f.read().strip().split(",") - failed_cases = [] - for rel in relations: - query = "*{}*".format(rel) - print("Running query: scan on Redis".format(query)) - output = subprocess.Popen(["docker", "compose", "exec", "redis", "redis-cli", "--scan", "--pattern", query], cwd=demo_dir, stdout=subprocess.PIPE) - rows = subprocess.check_output(["wc", "-l"], cwd=demo_dir, stdin=output.stdout) - output.stdout.close() - output.wait() - rows = int(rows.decode('utf8').strip()) - print("{} keys in '*{}*'".format(rows, rel)) - if rows < 1: - failed_cases.append(rel) - if len(failed_cases) != 0: - raise Exception("Data check failed for case {}".format(failed_cases)) - -def run_bigquery_demo(): - demo = "big-query-sink" - file_dir = dirname(abspath(__file__)) - project_dir = dirname(file_dir) - demo_dir = os.path.join(project_dir, demo) - print("Running demo: {}".format(demo)) - - subprocess.run(["docker", "compose", "up", "-d", "--build"], cwd=demo_dir, check=True) - subprocess.run(["docker", "compose", "exec", "gcloud-cli", "gcloud", "auth", "login", "--cred-file=/gcp-rwctest.json"], cwd=demo_dir, check=True) - subprocess.run(["docker", "compose", "exec", "gcloud-cli", "gcloud", "config", "set", "project", "rwctest"], cwd=demo_dir, check=True) - - bq_prepare_file = os.path.join(demo_dir, 'bq_prepare.sql') - bq_prepare_content = open(bq_prepare_file).read().strip() - subprocess.run(["docker", "compose", "exec", "gcloud-cli", "bq", "query", "--use_legacy_sql=false", bq_prepare_content], cwd=demo_dir, check=True) - sleep(30) - - sql_files = ['create_source.sql', 'create_mv.sql', 'create_sink.sql'] - for fname in sql_files: - sql_file = os.path.join(demo_dir, "append-only-sql/"+fname) - print("executing sql: ", open(sql_file).read()) - run_sql_file(sql_file, demo_dir) - - sleep(30) - sink_check_file = os.path.join(demo_dir, 'sink_check') - with open(sink_check_file) as f: - relations = f.read().strip().split(",") - failed_cases = [] - for rel in relations: - sql = "SELECT COUNT(*) AS count FROM 
`{}`".format(rel) - print("run sql {} on Bigquery".format(sql)) - rows = subprocess.check_output(["docker", "compose", "exec", "gcloud-cli", "bq", "query", "--use_legacy_sql=false", "--format=json", sql], cwd=demo_dir) - rows = int(json.loads(rows.decode("utf-8").strip())[0]['count']) - print("{} rows in {}".format(rows, rel)) - if rows < 1: - failed_cases.append(rel) - - drop_sql = "DROP TABLE IF EXISTS `{}`".format(rel) - subprocess.run(["docker", "compose", "exec", "gcloud-cli", "bq", "query", "--use_legacy_sql=false", drop_sql], cwd=demo_dir, check=True) - - if len(failed_cases) != 0: - raise Exception("Data check failed for case {}".format(failed_cases)) arg_parser = argparse.ArgumentParser(description="Run the demo") arg_parser.add_argument( @@ -379,24 +73,7 @@ def run_bigquery_demo(): # disable telemetry in env os.environ['ENABLE_TELEMETRY'] = "false" -if args.case == "iceberg-sink": - if args.format == "protobuf": - print("skip protobuf test for iceberg-sink") - else: - run_iceberg_demo() -elif args.case == "clickhouse-sink": - run_clickhouse_demo() -elif args.case == "iceberg-cdc": +if args.case == "iceberg-cdc": iceberg_cdc_demo() -elif args.case == "kafka-cdc-sink": - run_kafka_cdc_demo() -elif args.case == "cassandra-and-scylladb-sink": - run_cassandra_and_scylladb_sink_demo() -elif args.case == "elasticsearch-sink": - run_elasticsearch_sink_demo() -elif args.case == "redis-sink": - run_redis_demo() -elif args.case == "big-query-sink": - run_bigquery_demo() else: run_demo(args.case, args.format) diff --git a/integration_tests/tidb-cdc-sink/create_mv.sql b/integration_tests/tidb-cdc-sink/create_mv.sql index 242c42846bd5a..95aef4c4883f3 100644 --- a/integration_tests/tidb-cdc-sink/create_mv.sql +++ b/integration_tests/tidb-cdc-sink/create_mv.sql @@ -30,12 +30,3 @@ FROM datatype GROUP BY c0_boolean; - -CREATE SINK hot_hashtags_sink FROM hot_hashtags -WITH ( - connector='jdbc', - jdbc.url='jdbc:mysql://tidb:4000/test?user=root&password=', - table.name='hot_hashtags', - type='upsert', - primary_key='window_start,hashtag' -); diff --git a/integration_tests/tidb-cdc-sink/create_sink.sql b/integration_tests/tidb-cdc-sink/create_sink.sql new file mode 100644 index 0000000000000..7c7726ad8120f --- /dev/null +++ b/integration_tests/tidb-cdc-sink/create_sink.sql @@ -0,0 +1,8 @@ +CREATE SINK hot_hashtags_sink FROM hot_hashtags +WITH ( + connector='jdbc', + jdbc.url='jdbc:mysql://tidb:4000/test?user=root&password=', + table.name='hot_hashtags', + type='upsert', + primary_key='window_start,hashtag' +); diff --git a/integration_tests/tidb-cdc-sink/docker-compose.yml b/integration_tests/tidb-cdc-sink/docker-compose.yml index 70481ab6dbb5c..5d756ba15ffff 100644 --- a/integration_tests/tidb-cdc-sink/docker-compose.yml +++ b/integration_tests/tidb-cdc-sink/docker-compose.yml @@ -190,20 +190,13 @@ services: restart: always container_name: datagen - init_tidb: - image: mysql:8.0 - depends_on: - - tidb - command: - - /bin/sh - - -c - - "mysql --password= -h tidb --port 4000 -u root test < tidb_create_tables.sql && - sleep 10 && - mysql --password= -h tidb --port 4000 -u root test < tidb_prepare.sql" + mysql: + image: mysql:latest + command: tail -f /dev/null volumes: - "./tidb_create_tables.sql:/tidb_create_tables.sql" - "./tidb_prepare.sql:/tidb_prepare.sql" - container_name: init_tidb + container_name: mysql restart: on-failure volumes: @@ -219,3 +212,4 @@ volumes: external: false message_queue: external: false +name: risingwave-compose diff --git a/integration_tests/tidb-cdc-sink/prepare.sh 
b/integration_tests/tidb-cdc-sink/prepare.sh new file mode 100755 index 0000000000000..e60363e232e06 --- /dev/null +++ b/integration_tests/tidb-cdc-sink/prepare.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -euo pipefail + +docker compose exec mysql bash -c "mysql --password= -h tidb --port 4000 -u root test < tidb_create_tables.sql" + +docker compose exec mysql bash -c "mysql --password= -h tidb --port 4000 -u root test < tidb_prepare.sql" + +sleep 15 diff --git a/integration_tests/tidb-cdc-sink/sink_check.py b/integration_tests/tidb-cdc-sink/sink_check.py new file mode 100644 index 0000000000000..19185d950cc01 --- /dev/null +++ b/integration_tests/tidb-cdc-sink/sink_check.py @@ -0,0 +1,22 @@ +import sys +import subprocess + +relations = ['hot_hashtags'] + +failed_cases = [] +for rel in relations: + sql = f'SELECT COUNT(*) FROM {rel};' + command = f'mysql --password= -h tidb --port 4000 -u root test -e "{sql}"' + output = subprocess.check_output( + ["docker", "compose", "exec", "mysql", "bash", "-c", command]) + # output: + # COUNT(*) + # 0 + rows = int(output.decode('utf-8').split('\n')[1]) + print(f"{rows} rows in {rel}") + if rows < 1: + failed_cases.append(rel) + +if len(failed_cases) != 0: + print(f"Data check failed for case {failed_cases}") + sys.exit(1) diff --git a/proto/hummock.proto b/proto/hummock.proto index 3d3a831c1c06a..df582cc491ae3 100644 --- a/proto/hummock.proto +++ b/proto/hummock.proto @@ -116,6 +116,11 @@ message VnodeWatermark { common.Buffer vnode_bitmap = 2; } +// Table watermark is a lighter weight range delete introduced in +// https://github.com/risingwavelabs/risingwave/issues/13148 +// It means the lowest (or highest when `is_ascending` is false) visible +// keys in the table within a vnode. Keys lower (or higher) than the +// table watermark is invisible and will be cleaned in later compaction. message TableWatermarks { message EpochNewWatermarks { repeated VnodeWatermark watermarks = 1; @@ -145,7 +150,7 @@ message HummockVersion { // Snapshots with epoch less than the safe epoch have been GCed. // Reads against such an epoch will fail. uint64 safe_epoch = 4; - map table_watermarks = 5; + map table_watermarks = 5; } message HummockVersionDelta { @@ -162,7 +167,7 @@ message HummockVersionDelta { uint64 safe_epoch = 5; bool trivial_move = 6; repeated uint64 gc_object_ids = 7; - map new_table_watermarks = 8; + map new_table_watermarks = 8; } message HummockVersionDeltas { @@ -337,6 +342,9 @@ message CompactTask { // Deprecated. use table_vnode_partition instead; uint32 split_weight_by_vnode = 22 [deprecated = true]; map table_vnode_partition = 23; + // The table watermark of any table id. In compaction we only use the table watermarks on safe epoch, + // so we only need to include the table watermarks on safe epoch to reduce the size of metadata. + map table_watermarks = 24; } message LevelHandler { diff --git a/proto/stream_plan.proto b/proto/stream_plan.proto index c97a0f2d8406e..65b3a10db8b68 100644 --- a/proto/stream_plan.proto +++ b/proto/stream_plan.proto @@ -495,6 +495,9 @@ enum StreamScanType { // ChainExecutor with upstream_only = true STREAM_SCAN_TYPE_UPSTREAM_ONLY = 4; + + // ArrangementBackfillExecutor + STREAM_SCAN_TYPE_ARRANGEMENT_BACKFILL = 5; } // StreamScanNode reads data from upstream table first, and then pass all events to downstream. @@ -524,6 +527,7 @@ message StreamScanNode { catalog.Table state_table = 5; // The upstream materialized view info used by backfill. + // Used iff `ChainType::Backfill`. 
plan_common.StorageTableDesc table_desc = 7; // The rate limit for the stream scan node. @@ -531,6 +535,10 @@ message StreamScanNode { // Snapshot read every N barriers uint32 snapshot_read_barrier_interval = 9 [deprecated = true]; + + // The state table used by ArrangementBackfill to replicate upstream mview's state table. + // Used iff `ChainType::ArrangementBackfill`. + catalog.Table arrangement_table = 10; } message StreamCdcScanNode { diff --git a/proto/stream_service.proto b/proto/stream_service.proto index 5d82452dc0ca9..462f5ff0256a6 100644 --- a/proto/stream_service.proto +++ b/proto/stream_service.proto @@ -87,7 +87,7 @@ message BarrierCompleteResponse { } repeated GroupedSstableInfo synced_sstables = 4; uint32 worker_id = 5; - map table_watermarks = 6; + map table_watermarks = 6; } // Before starting streaming, the leader node broadcast the actor-host table to needed workers. diff --git a/src/common/src/array/data_chunk.rs b/src/common/src/array/data_chunk.rs index fff5efc22d1f8..90c2560cadcb2 100644 --- a/src/common/src/array/data_chunk.rs +++ b/src/common/src/array/data_chunk.rs @@ -34,8 +34,9 @@ use crate::field_generator::{FieldGeneratorImpl, VarcharProperty}; use crate::hash::HashCode; use crate::row::Row; use crate::types::{DataType, DatumRef, StructType, ToOwnedDatum, ToText}; +use crate::util::chunk_coalesce::DataChunkBuilder; use crate::util::hash_util::finalize_hashers; -use crate::util::iter_util::{ZipEqDebug, ZipEqFast}; +use crate::util::iter_util::ZipEqFast; use crate::util::value_encoding::{ estimate_serialize_datum_size, serialize_datum_into, try_get_exact_serialize_datum_size, ValueRowSerializer, @@ -95,23 +96,24 @@ impl DataChunk { } /// Build a `DataChunk` with rows. + /// + /// Panics if the `rows` is empty. + /// + /// Should prefer using [`DataChunkBuilder`] instead to avoid unnecessary allocation + /// of rows. pub fn from_rows(rows: &[impl Row], data_types: &[DataType]) -> Self { - let mut array_builders = data_types - .iter() - .map(|data_type| data_type.create_array_builder(1)) - .collect::>(); + // `append_one_row` will cause the builder to finish immediately once capacity is met. + // Hence, we allocate an extra row here, to avoid the builder finishing prematurely. + // This just makes the code cleaner, since we can loop through all rows, and consume it finally. + // TODO: introduce `new_unlimited` to decouple memory reservation from builder capacity. + let mut builder = DataChunkBuilder::new(data_types.to_vec(), rows.len() + 1); for row in rows { - for (datum, builder) in row.iter().zip_eq_debug(array_builders.iter_mut()) { - builder.append(datum); - } + let none = builder.append_one_row(row); + debug_assert!(none.is_none()); } - let new_columns = array_builders - .into_iter() - .map(|builder| builder.finish().into()) - .collect::>(); - DataChunk::new(new_columns, rows.len()) + builder.consume_all().expect("chunk should not be empty") } /// Return the next visible row index on or after `row_idx`. @@ -322,83 +324,24 @@ impl DataChunk { /// `rechunk` creates a new vector of data chunk whose size is `each_size_limit`. /// When the total cardinality of all the chunks is not evenly divided by the `each_size_limit`, /// the last new chunk will be the remainder. - /// - /// Currently, `rechunk` would ignore visibility map. 
May or may not support it later depending - /// on the demand pub fn rechunk(chunks: &[DataChunk], each_size_limit: usize) -> ArrayResult> { - assert!(each_size_limit > 0); - // Corner case: one of the `chunks` may have 0 length - // remove the chunks with zero physical length here, - // or skip them in the loop below - let chunks = chunks - .iter() - .filter(|chunk| chunk.capacity() != 0) - .collect::>(); - if chunks.is_empty() { + let Some(data_types) = chunks.first().map(|c| c.data_types()) else { return Ok(Vec::new()); - } + }; - let mut total_capacity = chunks.iter().map(|chunk| chunk.capacity()).sum(); - let num_chunks = (total_capacity + each_size_limit - 1) / each_size_limit; + let mut builder = DataChunkBuilder::new(data_types, each_size_limit); + let mut outputs = Vec::new(); - // the idx of `chunks` - let mut chunk_idx = 0; - // the row idx of `chunks[chunk_idx]` - let mut start_row_idx = 0; - // how many rows does this new chunk need? - let mut new_chunk_require = std::cmp::min(total_capacity, each_size_limit); - let mut array_builders: Vec = chunks[0] - .columns - .iter() - .map(|col| col.create_builder(new_chunk_require)) - .collect(); - let mut array_len = new_chunk_require; - let mut new_chunks = Vec::with_capacity(num_chunks); - while chunk_idx < chunks.len() { - let capacity = chunks[chunk_idx].capacity(); - let num_rows_left = capacity - start_row_idx; - let actual_acquire = std::cmp::min(new_chunk_require, num_rows_left); - let end_row_idx = start_row_idx + actual_acquire - 1; - array_builders - .iter_mut() - .zip_eq_fast(chunks[chunk_idx].columns()) - .for_each(|(builder, column)| { - let mut array_builder = column.create_builder(end_row_idx - start_row_idx + 1); - for row_idx in start_row_idx..=end_row_idx { - array_builder.append(column.value_at(row_idx)); - } - builder.append_array(&array_builder.finish()); - }); - // since `end_row_idx` is inclusive, exclude it for the next round. - start_row_idx = end_row_idx + 1; - // if the current `chunks[chunk_idx] is used up, move to the next one - if start_row_idx == capacity { - chunk_idx += 1; - start_row_idx = 0; - } - new_chunk_require -= actual_acquire; - total_capacity -= actual_acquire; - // a new chunk receives enough rows, finalize it - if new_chunk_require == 0 { - let new_columns: Vec = array_builders - .drain(..) - .map(|builder| builder.finish().into()) - .collect(); - - array_builders = new_columns - .iter() - .map(|col_type| col_type.create_builder(new_chunk_require)) - .collect(); - - let data_chunk = DataChunk::new(new_columns, array_len); - new_chunks.push(data_chunk); - - new_chunk_require = std::cmp::min(total_capacity, each_size_limit); - array_len = new_chunk_require; + for chunk in chunks { + for output in builder.append_chunk(chunk.clone()) { + outputs.push(output); } } + if let Some(output) = builder.consume_all() { + outputs.push(output); + } - Ok(new_chunks) + Ok(outputs) } /// Compute hash values for each row. 
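The rewritten `from_rows` and `rechunk` above both delegate chunk assembly to `DataChunkBuilder`. A minimal caller-side sketch of that pattern, assuming the `DataChunkBuilder` API shown in `src/common/src/util/chunk_coalesce.rs` (`new`, `append_one_row`, `consume_all`) and two hypothetical helpers, `produce_rows` and `handle_chunk`:

use risingwave_common::array::DataChunk;
use risingwave_common::row::OwnedRow;
use risingwave_common::types::DataType;
use risingwave_common::util::chunk_coalesce::DataChunkBuilder;

// Hypothetical helpers, for illustration only.
fn produce_rows() -> Vec<OwnedRow> {
    Vec::new()
}
fn handle_chunk(_chunk: DataChunk) {}

fn build_chunks(data_types: Vec<DataType>) {
    // Cap each output chunk at 1024 rows; the builder hands back a full chunk as
    // soon as the capacity is reached, so rows never pile up in an intermediate Vec.
    let mut builder = DataChunkBuilder::new(data_types, 1024);
    for row in produce_rows() {
        if let Some(chunk) = builder.append_one_row(row) {
            handle_chunk(chunk);
        }
    }
    // Flush the final, possibly smaller, chunk.
    if let Some(chunk) = builder.consume_all() {
        handle_chunk(chunk);
    }
}

Emitting chunks as they fill is also why the new doc comments steer callers toward the builder instead of `from_rows`: it avoids materializing every row before the first chunk can be produced.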
diff --git a/src/common/src/array/mod.rs b/src/common/src/array/mod.rs index 086f7ffd5cc9d..80d84e5245d2d 100644 --- a/src/common/src/array/mod.rs +++ b/src/common/src/array/mod.rs @@ -32,6 +32,7 @@ mod num256_array; mod primitive_array; mod proto_reader; pub mod stream_chunk; +pub mod stream_chunk_builder; mod stream_chunk_iter; pub mod stream_record; pub mod struct_array; diff --git a/src/common/src/array/stream_chunk.rs b/src/common/src/array/stream_chunk.rs index e024d22ec5172..192d4adfe7d3e 100644 --- a/src/common/src/array/stream_chunk.rs +++ b/src/common/src/array/stream_chunk.rs @@ -24,6 +24,7 @@ use rand::prelude::SmallRng; use rand::{Rng, SeedableRng}; use risingwave_pb::data::{PbOp, PbStreamChunk}; +use super::stream_chunk_builder::StreamChunkBuilder; use super::{ArrayImpl, ArrayRef, ArrayResult, DataChunkTestExt, RowRef}; use crate::array::DataChunk; use crate::buffer::{Bitmap, BitmapBuilder}; @@ -32,7 +33,7 @@ use crate::estimate_size::EstimateSize; use crate::field_generator::VarcharProperty; use crate::row::Row; use crate::types::{DataType, DefaultOrdered, ToText}; -use crate::util::iter_util::ZipEqDebug; + /// `Op` represents three operations in `StreamChunk`. /// /// `UpdateDelete` and `UpdateInsert` are semantically equivalent to `Delete` and `Insert` @@ -125,26 +126,24 @@ impl StreamChunk { } /// Build a `StreamChunk` from rows. - // TODO: introducing something like `StreamChunkBuilder` maybe better. + /// + /// Panics if the `rows` is empty. + /// + /// Should prefer using [`StreamChunkBuilder`] instead to avoid unnecessary + /// allocation of rows. pub fn from_rows(rows: &[(Op, impl Row)], data_types: &[DataType]) -> Self { - let mut array_builders = data_types - .iter() - .map(|data_type| data_type.create_array_builder(rows.len())) - .collect::>(); - let mut ops = vec![]; + // `append_row` will cause the builder to finish immediately once capacity is met. + // Hence, we allocate an extra row here, to avoid the builder finishing prematurely. + // This just makes the code cleaner, since we can loop through all rows, and consume it finally. + // TODO: introduce `new_unlimited` to decouple memory reservation from builder capacity. + let mut builder = StreamChunkBuilder::new(rows.len() + 1, data_types.to_vec()); for (op, row) in rows { - ops.push(*op); - for (datum, builder) in row.iter().zip_eq_debug(array_builders.iter_mut()) { - builder.append(datum); - } + let none = builder.append_row(*op, row); + debug_assert!(none.is_none()); } - let new_columns = array_builders - .into_iter() - .map(|builder| builder.finish().into()) - .collect::>(); - StreamChunk::new(ops, new_columns) + builder.take().expect("chunk should not be empty") } /// Get the reference of the underlying data chunk. @@ -182,33 +181,20 @@ impl StreamChunk { /// For consecutive `UpdateDelete` and `UpdateInsert`, they will be kept in one chunk. /// As a result, some chunks may have `size + 1` rows. pub fn split(&self, size: usize) -> Vec { - let data_types = self.data_types(); - let mut rows = Vec::with_capacity(size + 1); - let mut results = vec![]; - - let mut iter = self.rows(); - while let Some(row) = iter.next() { - rows.push(row); - if rows.len() == size { - // If the last row is UpdateDelete, also include the UpdateInsert. 
- if rows.last().unwrap().0 == Op::UpdateDelete { - let next_row = iter - .next() - .expect("UpdateDelete should have UpdateInsert after"); - assert_eq!(next_row.0, Op::UpdateInsert); - rows.push(next_row); - } - let chunk = Self::from_rows(&rows, &data_types); - results.push(chunk); - rows.clear(); + let mut builder = StreamChunkBuilder::new(size, self.data_types()); + let mut outputs = Vec::new(); + + // TODO: directly append the chunk. + for (op, row) in self.rows() { + if let Some(chunk) = builder.append_row(op, row) { + outputs.push(chunk); } } - - if !rows.is_empty() { - let chunk = StreamChunk::from_rows(&rows, &data_types); - results.push(chunk); + if let Some(output) = builder.take() { + outputs.push(output); } - results + + outputs } pub fn into_parts(self) -> (DataChunk, Arc<[Op]>) { @@ -534,38 +520,10 @@ impl StreamChunkMut { } } } -/// Test utilities for [`StreamChunk`]. -pub trait StreamChunkTestExt: Sized { - fn from_pretty(s: &str) -> Self; - - /// Validate the `StreamChunk` layout. - fn valid(&self) -> bool; - - /// Concatenate multiple `StreamChunk` into one. - fn concat(chunks: Vec) -> Self; - - /// Sort rows. - fn sort_rows(self) -> Self; - - /// Generate stream chunks - fn gen_stream_chunks( - num_of_chunks: usize, - chunk_size: usize, - data_types: &[DataType], - varchar_properties: &VarcharProperty, - ) -> Vec; - - fn gen_stream_chunks_inner( - num_of_chunks: usize, - chunk_size: usize, - data_types: &[DataType], - varchar_properties: &VarcharProperty, - visibility_percent: f64, // % of rows that are visible - inserts_percent: f64, - ) -> Vec; -} -impl StreamChunkTestExt for StreamChunk { +/// Test utilities for [`StreamChunk`]. +#[easy_ext::ext(StreamChunkTestExt)] +impl StreamChunk { /// Parse a chunk from string. /// /// See also [`DataChunkTestExt::from_pretty`]. @@ -606,7 +564,7 @@ impl StreamChunkTestExt for StreamChunk { /// // x[]: array of x /// // : struct /// ``` - fn from_pretty(s: &str) -> Self { + pub fn from_pretty(s: &str) -> Self { let mut chunk_str = String::new(); let mut ops = vec![]; @@ -647,34 +605,39 @@ impl StreamChunkTestExt for StreamChunk { } } - fn valid(&self) -> bool { + /// Validate the `StreamChunk` layout. + pub fn valid(&self) -> bool { let len = self.ops.len(); let data = &self.data; data.visibility().len() == len && data.columns().iter().all(|col| col.len() == len) } - fn concat(chunks: Vec) -> StreamChunk { - assert!(!chunks.is_empty()); - let mut ops = vec![]; - let mut data_chunks = vec![]; - let mut capacity = 0; + /// Concatenate multiple `StreamChunk` into one. + /// + /// Panics if `chunks` is empty. + pub fn concat(chunks: Vec) -> StreamChunk { + let data_types = chunks[0].data_types(); + let size = chunks.iter().map(|c| c.cardinality()).sum::(); + + // `append_row` will cause the builder to finish immediately once capacity is met. + // Hence, we allocate an extra row here, to avoid the builder finishing prematurely. + // This just makes the code cleaner, since we can loop through all rows, and consume it finally. + // TODO: introduce `new_unlimited` to decouple memory reservation from builder capacity. + let mut builder = StreamChunkBuilder::new(size + 1, data_types); + for chunk in chunks { - capacity += chunk.capacity(); - ops.extend(chunk.ops.iter()); - data_chunks.push(chunk.data); - } - let data = DataChunk::rechunk(&data_chunks, capacity) - .unwrap() - .into_iter() - .next() - .unwrap(); - StreamChunk { - ops: ops.into(), - data, + // TODO: directly append chunks. 
+ for (op, row) in chunk.rows() { + let none = builder.append_row(op, row); + debug_assert!(none.is_none()); + } } + + builder.take().expect("chunk should not be empty") } - fn sort_rows(self) -> Self { + /// Sort rows. + pub fn sort_rows(self) -> Self { if self.capacity() == 0 { return self; } @@ -693,7 +656,7 @@ impl StreamChunkTestExt for StreamChunk { /// Generate `num_of_chunks` data chunks with type `data_types`, /// where each data chunk has cardinality of `chunk_size`. /// TODO(kwannoel): Generate different types of op, different vis. - fn gen_stream_chunks( + pub fn gen_stream_chunks( num_of_chunks: usize, chunk_size: usize, data_types: &[DataType], @@ -709,7 +672,7 @@ impl StreamChunkTestExt for StreamChunk { ) } - fn gen_stream_chunks_inner( + pub fn gen_stream_chunks_inner( num_of_chunks: usize, chunk_size: usize, data_types: &[DataType], diff --git a/src/common/src/array/stream_chunk_builder.rs b/src/common/src/array/stream_chunk_builder.rs new file mode 100644 index 0000000000000..f9e7001bed8e6 --- /dev/null +++ b/src/common/src/array/stream_chunk_builder.rs @@ -0,0 +1,146 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::array::stream_record::Record; +use crate::array::{ArrayBuilderImpl, Op, StreamChunk}; +use crate::row::Row; +use crate::types::{DataType, DatumRef}; +use crate::util::iter_util::ZipEqFast; + +/// Build stream chunks with fixed chunk size from rows or records. +pub struct StreamChunkBuilder { + /// operations in the data chunk to build + ops: Vec, + + /// arrays in the data chunk to build + column_builders: Vec, + + /// Data types of columns + data_types: Vec, + + /// Maximum capacity of column builder + capacity: usize, + + /// Size of column builder + size: usize, +} + +impl Drop for StreamChunkBuilder { + fn drop(&mut self) { + // Possible to fail when async task gets cancelled. + if self.size != 0 { + tracing::warn!( + remaining = self.size, + "dropping non-empty stream chunk builder" + ); + } + } +} + +impl StreamChunkBuilder { + pub fn new(chunk_size: usize, data_types: Vec) -> Self { + assert!(chunk_size > 0); + + let ops = Vec::with_capacity(chunk_size); + let column_builders = data_types + .iter() + .map(|datatype| datatype.create_array_builder(chunk_size)) + .collect(); + Self { + ops, + column_builders, + data_types, + capacity: chunk_size, + size: 0, + } + } + + /// Increase chunk size + /// + /// A [`StreamChunk`] will be returned when `size == capacity` + #[must_use] + fn inc_size(&mut self) -> Option { + self.size += 1; + + // Take a chunk when capacity is exceeded. Splitting `UpdateDelete` and `UpdateInsert` + // should be avoided, so when the last one is `UpdateDelete`, we delay the chunk until + // `UpdateInsert` comes. This means the output chunk size may exceed the given `chunk_size`, + // and theoretically at most `chunk_size + 1` if inputs are consistent. 
+ if self.size >= self.capacity && self.ops[self.ops.len() - 1] != Op::UpdateDelete { + self.take() + } else { + None + } + } + + /// Append an iterator of output index and datum to the builder, return a chunk if the builder + /// is full. + /// + /// Note: the caller must ensure that each column occurs exactly once in `iter`. + #[must_use] + pub fn append_iter<'a>( + &mut self, + op: Op, + iter: impl IntoIterator)>, + ) -> Option { + self.ops.push(op); + for (i, datum) in iter { + self.column_builders[i].append(datum); + } + self.inc_size() + } + + /// Append a row to the builder, return a chunk if the builder is full. + #[must_use] + pub fn append_row(&mut self, op: Op, row: impl Row) -> Option { + self.append_iter(op, row.iter().enumerate()) + } + + /// Append a record to the builder, return a chunk if the builder is full. + #[must_use] + pub fn append_record(&mut self, record: Record) -> Option { + match record { + Record::Insert { new_row } => self.append_row(Op::Insert, new_row), + Record::Delete { old_row } => self.append_row(Op::Delete, old_row), + Record::Update { old_row, new_row } => { + let none = self.append_row(Op::UpdateDelete, old_row); + debug_assert!(none.is_none()); + self.append_row(Op::UpdateInsert, new_row) + } + } + } + + #[must_use] + pub fn take(&mut self) -> Option { + if self.size == 0 { + return None; + } + + self.size = 0; + let new_columns = self + .column_builders + .iter_mut() + .zip_eq_fast(&self.data_types) + .map(|(builder, datatype)| { + std::mem::replace(builder, datatype.create_array_builder(self.capacity)).finish() + }) + .map(Into::into) + .collect::>(); + + Some(StreamChunk::new( + std::mem::replace(&mut self.ops, Vec::with_capacity(self.capacity)), + new_columns, + )) + } +} diff --git a/src/common/src/array/stream_record.rs b/src/common/src/array/stream_record.rs index f9b87adeccf63..1c7b7062962cf 100644 --- a/src/common/src/array/stream_record.rs +++ b/src/common/src/array/stream_record.rs @@ -61,16 +61,13 @@ impl Record { pub fn to_stream_chunk(&self, data_types: &[DataType]) -> StreamChunk { match self { Record::Insert { new_row } => { - StreamChunk::from_rows(&[(Op::Insert, new_row.to_owned_row())], data_types) + StreamChunk::from_rows(&[(Op::Insert, new_row)], data_types) } Record::Delete { old_row } => { - StreamChunk::from_rows(&[(Op::Delete, old_row.to_owned_row())], data_types) + StreamChunk::from_rows(&[(Op::Delete, old_row)], data_types) } Record::Update { old_row, new_row } => StreamChunk::from_rows( - &[ - (Op::UpdateDelete, old_row.to_owned_row()), - (Op::UpdateInsert, new_row.to_owned_row()), - ], + &[(Op::UpdateDelete, old_row), (Op::UpdateInsert, new_row)], data_types, ), } diff --git a/src/common/src/util/chunk_coalesce.rs b/src/common/src/util/chunk_coalesce.rs index 3bd56b19e434d..cb7845816bfeb 100644 --- a/src/common/src/util/chunk_coalesce.rs +++ b/src/common/src/util/chunk_coalesce.rs @@ -39,6 +39,8 @@ pub struct DataChunkBuilder { impl DataChunkBuilder { pub fn new(data_types: Vec, batch_size: usize) -> Self { + assert!(batch_size > 0); + Self { data_types, batch_size, @@ -124,7 +126,8 @@ impl DataChunkBuilder { pub fn append_chunk(&mut self, data_chunk: DataChunk) -> AppendDataChunk<'_> { AppendDataChunk { builder: self, - remaining: Some(SlicedDataChunk::new_checked(data_chunk)), + remaining: (data_chunk.capacity() > 0) // defensive check for empty chunk + .then(|| SlicedDataChunk::new_checked(data_chunk)), } } diff --git a/src/compute/src/rpc/service/stream_service.rs b/src/compute/src/rpc/service/stream_service.rs index 
4dceda1719f81..4e1765e32824f 100644 --- a/src/compute/src/rpc/service/stream_service.rs +++ b/src/compute/src/rpc/service/stream_service.rs @@ -247,7 +247,7 @@ impl StreamService for StreamServiceImpl { worker_id: self.env.worker_id(), table_watermarks: table_watermarks .into_iter() - .map(|(key, value)| (key.table_id as u64, value.to_protobuf())) + .map(|(key, value)| (key.table_id, value.to_protobuf())) .collect(), })) } diff --git a/src/frontend/planner_test/tests/testdata/input/emit_on_window_close.yaml b/src/frontend/planner_test/tests/testdata/input/emit_on_window_close.yaml index 158e35850262a..bb917afc60a29 100644 --- a/src/frontend/planner_test/tests/testdata/input/emit_on_window_close.yaml +++ b/src/frontend/planner_test/tests/testdata/input/emit_on_window_close.yaml @@ -40,3 +40,19 @@ WITH (connector = 'blackhole'); expected_outputs: - explain_output +- sql: | + CREATE table s1 (id int, value int, ts TIMESTAMP, WATERMARK FOR ts AS ts - INTERVAL '20' SECOND ) append only; + CREATE table s2 (id int, value int, ts TIMESTAMP, WATERMARK FOR ts AS ts - INTERVAL '20' SECOND) append only; + select *, count(*) over (partition by value2 order by ts2) from ( + SELECT s1.id AS id1, + s1.value AS value1, + s2.id AS id2, + s2.value AS value2, + s1.ts as ts1, + s2.ts as ts2 + FROM s1 JOIN s2 + ON s1.id = s2.id and s1.ts >= s2.ts and s1.ts - INTERVAL '1' MINUTE <= s2.ts + ); + expected_outputs: + - eowc_stream_plan + - stream_plan \ No newline at end of file diff --git a/src/frontend/planner_test/tests/testdata/output/emit_on_window_close.yaml b/src/frontend/planner_test/tests/testdata/output/emit_on_window_close.yaml index cd3019382bd66..acdc201f0c6fb 100644 --- a/src/frontend/planner_test/tests/testdata/output/emit_on_window_close.yaml +++ b/src/frontend/planner_test/tests/testdata/output/emit_on_window_close.yaml @@ -215,3 +215,37 @@ └─StreamEowcSort { sort_column: t.tm } └─StreamExchange { dist: HashShard(t.bar) } └─StreamTableScan { table: t, columns: [tm, foo, bar, _row_id] } +- sql: | + CREATE table s1 (id int, value int, ts TIMESTAMP, WATERMARK FOR ts AS ts - INTERVAL '20' SECOND ) append only; + CREATE table s2 (id int, value int, ts TIMESTAMP, WATERMARK FOR ts AS ts - INTERVAL '20' SECOND) append only; + select *, count(*) over (partition by value2 order by ts2) from ( + SELECT s1.id AS id1, + s1.value AS value1, + s2.id AS id2, + s2.value AS value2, + s1.ts as ts1, + s2.ts as ts2 + FROM s1 JOIN s2 + ON s1.id = s2.id and s1.ts >= s2.ts and s1.ts - INTERVAL '1' MINUTE <= s2.ts + ); + stream_plan: |- + StreamMaterialize { columns: [id1, value1, id2, value2, ts1, ts2, s1._row_id(hidden), s2._row_id(hidden), count], stream_key: [s1._row_id, s2._row_id, id1, value2], pk_columns: [s1._row_id, s2._row_id, id1, value2], pk_conflict: NoCheck } + └─StreamOverWindow { window_functions: [count() OVER(PARTITION BY s2.value ORDER BY s2.ts ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)] } + └─StreamExchange { dist: HashShard(s2.value) } + └─StreamHashJoin [interval, append_only] { type: Inner, predicate: s1.id = s2.id AND (s1.ts >= s2.ts) AND ($expr1 <= s2.ts), conditions_to_clean_left_state_table: (s1.ts >= s2.ts), conditions_to_clean_right_state_table: ($expr1 <= s2.ts), output_watermarks: [s1.ts, s2.ts], output: [s1.id, s1.value, s2.id, s2.value, s1.ts, s2.ts, s1._row_id, s2._row_id] } + ├─StreamExchange { dist: HashShard(s1.id) } + │ └─StreamProject { exprs: [s1.id, s1.value, s1.ts, (s1.ts - '00:01:00':Interval) as $expr1, s1._row_id], output_watermarks: [s1.ts, $expr1] } + │ 
└─StreamTableScan { table: s1, columns: [s1.id, s1.value, s1.ts, s1._row_id], pk: [s1._row_id], dist: UpstreamHashShard(s1._row_id) } + └─StreamExchange { dist: HashShard(s2.id) } + └─StreamTableScan { table: s2, columns: [s2.id, s2.value, s2.ts, s2._row_id], pk: [s2._row_id], dist: UpstreamHashShard(s2._row_id) } + eowc_stream_plan: |- + StreamMaterialize { columns: [id1, value1, id2, value2, ts1, ts2, s1._row_id(hidden), s2._row_id(hidden), count], stream_key: [s1._row_id, s2._row_id, id1, value2], pk_columns: [s1._row_id, s2._row_id, id1, value2], pk_conflict: NoCheck } + └─StreamEowcOverWindow { window_functions: [count() OVER(PARTITION BY s2.value ORDER BY s2.ts ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)] } + └─StreamEowcSort { sort_column: s2.ts } + └─StreamExchange { dist: HashShard(s2.value) } + └─StreamHashJoin [interval, append_only] { type: Inner, predicate: s1.id = s2.id AND (s1.ts >= s2.ts) AND ($expr1 <= s2.ts), conditions_to_clean_left_state_table: (s1.ts >= s2.ts), conditions_to_clean_right_state_table: ($expr1 <= s2.ts), output_watermarks: [s1.ts, s2.ts], output: [s1.id, s1.value, s2.id, s2.value, s1.ts, s2.ts, s1._row_id, s2._row_id] } + ├─StreamExchange { dist: HashShard(s1.id) } + │ └─StreamProject { exprs: [s1.id, s1.value, s1.ts, (s1.ts - '00:01:00':Interval) as $expr1, s1._row_id], output_watermarks: [s1.ts, $expr1] } + │ └─StreamTableScan { table: s1, columns: [s1.id, s1.value, s1.ts, s1._row_id], pk: [s1._row_id], dist: UpstreamHashShard(s1._row_id) } + └─StreamExchange { dist: HashShard(s2.id) } + └─StreamTableScan { table: s2, columns: [s2.id, s2.value, s2.ts, s2._row_id], pk: [s2._row_id], dist: UpstreamHashShard(s2._row_id) } diff --git a/src/frontend/planner_test/tests/testdata/output/nexmark_watermark.yaml b/src/frontend/planner_test/tests/testdata/output/nexmark_watermark.yaml index 53df7f872633e..ba5b6198f01db 100644 --- a/src/frontend/planner_test/tests/testdata/output/nexmark_watermark.yaml +++ b/src/frontend/planner_test/tests/testdata/output/nexmark_watermark.yaml @@ -1018,9 +1018,9 @@ └─BatchProject { exprs: [event_type, person, auction, bid, Case((event_type = 0:Int32), Field(person, 6:Int32), (event_type = 1:Int32), Field(auction, 5:Int32), Field(bid, 5:Int32)) as $expr10, _row_id] } └─BatchSource { source: nexmark, columns: [event_type, person, auction, bid, _row_id], filter: (None, None) } stream_plan: |- - StreamMaterialize { columns: [id, item_name, description, initial_bid, reserve, date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id(hidden), _row_id#1(hidden)], stream_key: [id], pk_columns: [id], pk_conflict: NoCheck, watermark_columns: [bid_date_time] } - └─StreamGroupTopN [append_only] { order: [$expr12 DESC, $expr1 ASC], limit: 1, offset: 0, group_key: [$expr2], output_watermarks: [$expr1] } - └─StreamHashJoin [append_only] { type: Inner, predicate: $expr2 = $expr10 AND ($expr1 >= $expr1) AND ($expr1 <= $expr7), conditions_to_clean_right_state_table: ($expr1 >= $expr1), output_watermarks: [$expr1], output: [$expr2, $expr3, $expr4, $expr5, $expr6, $expr1, $expr7, $expr8, $expr9, $expr10, $expr11, $expr12, $expr1, _row_id, _row_id] } + StreamMaterialize { columns: [id, item_name, description, initial_bid, reserve, date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id(hidden), _row_id#1(hidden)], stream_key: [id], pk_columns: [id], pk_conflict: NoCheck, watermark_columns: [date_time, bid_date_time] } + └─StreamGroupTopN [append_only] { order: [$expr12 DESC, $expr1 ASC], 
limit: 1, offset: 0, group_key: [$expr2], output_watermarks: [$expr1, $expr1] } + └─StreamHashJoin [append_only] { type: Inner, predicate: $expr2 = $expr10 AND ($expr1 >= $expr1) AND ($expr1 <= $expr7), conditions_to_clean_right_state_table: ($expr1 >= $expr1), output_watermarks: [$expr1, $expr1], output: [$expr2, $expr3, $expr4, $expr5, $expr6, $expr1, $expr7, $expr8, $expr9, $expr10, $expr11, $expr12, $expr1, _row_id, _row_id] } ├─StreamExchange { dist: HashShard($expr2) } │ └─StreamProject { exprs: [Field(auction, 0:Int32) as $expr2, Field(auction, 1:Int32) as $expr3, Field(auction, 2:Int32) as $expr4, Field(auction, 3:Int32) as $expr5, Field(auction, 4:Int32) as $expr6, $expr1, Field(auction, 6:Int32) as $expr7, Field(auction, 7:Int32) as $expr8, Field(auction, 8:Int32) as $expr9, _row_id], output_watermarks: [$expr1] } │ └─StreamFilter { predicate: (event_type = 1:Int32) } @@ -1043,9 +1043,9 @@ └─StreamSource { source: nexmark, columns: [event_type, person, auction, bid, _row_id] } stream_dist_plan: |+ Fragment 0 - StreamMaterialize { columns: [id, item_name, description, initial_bid, reserve, date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id(hidden), _row_id#1(hidden)], stream_key: [id], pk_columns: [id], pk_conflict: NoCheck, watermark_columns: [bid_date_time] } { materialized table: 4294967294 } - └── StreamGroupTopN [append_only] { order: [$expr12 DESC, $expr1 ASC], limit: 1, offset: 0, group_key: [$expr2], output_watermarks: [$expr1] } { state table: 0 } - └── StreamHashJoin [append_only] { type: Inner, predicate: $expr2 = $expr10 AND ($expr1 >= $expr1) AND ($expr1 <= $expr7), conditions_to_clean_right_state_table: ($expr1 >= $expr1), output_watermarks: [$expr1], output: [$expr2, $expr3, $expr4, $expr5, $expr6, $expr1, $expr7, $expr8, $expr9, $expr10, $expr11, $expr12, $expr1, _row_id, _row_id] } + StreamMaterialize { columns: [id, item_name, description, initial_bid, reserve, date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id(hidden), _row_id#1(hidden)], stream_key: [id], pk_columns: [id], pk_conflict: NoCheck, watermark_columns: [date_time, bid_date_time] } { materialized table: 4294967294 } + └── StreamGroupTopN [append_only] { order: [$expr12 DESC, $expr1 ASC], limit: 1, offset: 0, group_key: [$expr2], output_watermarks: [$expr1, $expr1] } { state table: 0 } + └── StreamHashJoin [append_only] { type: Inner, predicate: $expr2 = $expr10 AND ($expr1 >= $expr1) AND ($expr1 <= $expr7), conditions_to_clean_right_state_table: ($expr1 >= $expr1), output_watermarks: [$expr1, $expr1], output: [$expr2, $expr3, $expr4, $expr5, $expr6, $expr1, $expr7, $expr8, $expr9, $expr10, $expr11, $expr12, $expr1, _row_id, _row_id] } ├── left table: 1 ├── right table: 3 ├── left degree table: 2 @@ -1088,10 +1088,10 @@ Table 4294967294 { columns: [ id, item_name, description, initial_bid, reserve, date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id, _row_id#1 ], primary key: [ $0 ASC ], value indices: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 ], distribution key: [ 0 ], read pk prefix len hint: 1 } eowc_stream_plan: |- - StreamMaterialize { columns: [id, item_name, description, initial_bid, reserve, date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id(hidden), _row_id#1(hidden)], stream_key: [id], pk_columns: [id], pk_conflict: NoCheck, watermark_columns: [bid_date_time] } + StreamMaterialize { columns: [id, item_name, description, initial_bid, reserve, 
date_time, expires, seller, category, auction, bidder, price, bid_date_time, _row_id(hidden), _row_id#1(hidden)], stream_key: [id], pk_columns: [id], pk_conflict: NoCheck, watermark_columns: [date_time] } └─StreamEowcSort { sort_column: $expr1 } - └─StreamGroupTopN [append_only] { order: [$expr12 DESC, $expr1 ASC], limit: 1, offset: 0, group_key: [$expr2], output_watermarks: [$expr1] } - └─StreamHashJoin [append_only] { type: Inner, predicate: $expr2 = $expr10 AND ($expr1 >= $expr1) AND ($expr1 <= $expr7), conditions_to_clean_right_state_table: ($expr1 >= $expr1), output_watermarks: [$expr1], output: [$expr2, $expr3, $expr4, $expr5, $expr6, $expr1, $expr7, $expr8, $expr9, $expr10, $expr11, $expr12, $expr1, _row_id, _row_id] } + └─StreamGroupTopN [append_only] { order: [$expr12 DESC, $expr1 ASC], limit: 1, offset: 0, group_key: [$expr2], output_watermarks: [$expr1, $expr1] } + └─StreamHashJoin [append_only] { type: Inner, predicate: $expr2 = $expr10 AND ($expr1 >= $expr1) AND ($expr1 <= $expr7), conditions_to_clean_right_state_table: ($expr1 >= $expr1), output_watermarks: [$expr1, $expr1], output: [$expr2, $expr3, $expr4, $expr5, $expr6, $expr1, $expr7, $expr8, $expr9, $expr10, $expr11, $expr12, $expr1, _row_id, _row_id] } ├─StreamExchange { dist: HashShard($expr2) } │ └─StreamProject { exprs: [Field(auction, 0:Int32) as $expr2, Field(auction, 1:Int32) as $expr3, Field(auction, 2:Int32) as $expr4, Field(auction, 3:Int32) as $expr5, Field(auction, 4:Int32) as $expr6, $expr1, Field(auction, 6:Int32) as $expr7, Field(auction, 7:Int32) as $expr8, Field(auction, 8:Int32) as $expr9, _row_id], output_watermarks: [$expr1] } │ └─StreamFilter { predicate: (event_type = 1:Int32) } diff --git a/src/frontend/src/catalog/table_catalog.rs b/src/frontend/src/catalog/table_catalog.rs index c8b7b4ef437e4..63fd9e5496919 100644 --- a/src/frontend/src/catalog/table_catalog.rs +++ b/src/frontend/src/catalog/table_catalog.rs @@ -119,8 +119,7 @@ pub struct TableCatalog { /// `None`. pub row_id_index: Option, - /// The column indices which are stored in the state store's value with row-encoding. Currently - /// is not supported yet and expected to be `[0..columns.len()]`. + /// The column indices which are stored in the state store's value with row-encoding. pub value_indices: Vec, /// The full `CREATE TABLE` or `CREATE MATERIALIZED VIEW` definition of the table. diff --git a/src/frontend/src/handler/create_index.rs b/src/frontend/src/handler/create_index.rs index 006230552ea02..1553a84c1a09b 100644 --- a/src/frontend/src/handler/create_index.rs +++ b/src/frontend/src/handler/create_index.rs @@ -14,6 +14,7 @@ use std::collections::{HashMap, HashSet}; use std::rc::Rc; +use std::sync::Arc; use either::Either; use fixedbitset::FixedBitSet; @@ -41,6 +42,7 @@ use crate::optimizer::{OptimizerContext, OptimizerContextRef, PlanRef, PlanRoot} use crate::scheduler::streaming_manager::CreatingStreamingJobInfo; use crate::session::SessionImpl; use crate::stream_fragmenter::build_graph; +use crate::TableCatalog; pub(crate) fn gen_create_index_plan( session: &SessionImpl, @@ -182,7 +184,7 @@ pub(crate) fn gen_create_index_plan( // Manually assemble the materialization plan for the index MV. 
let materialize = assemble_materialize( table_name, - table_desc.clone(), + table.clone(), context, index_table_name.clone(), &index_columns_ordered_expr, @@ -308,7 +310,7 @@ fn build_index_item( /// `distributed_by_columns_len` to represent distributed by columns fn assemble_materialize( table_name: String, - table_desc: Rc, + table_catalog: Arc, context: OptimizerContextRef, index_name: String, index_columns: &[(ExprImpl, OrderType)], @@ -324,7 +326,7 @@ fn assemble_materialize( let logical_scan = LogicalScan::create( table_name, - table_desc.clone(), + table_catalog.clone(), // Index table has no indexes. vec![], context, @@ -348,12 +350,12 @@ fn assemble_materialize( let out_names: Vec = index_columns .iter() .map(|(expr, _)| match expr { - ExprImpl::InputRef(input_ref) => table_desc - .columns + ExprImpl::InputRef(input_ref) => table_catalog + .columns() .get(input_ref.index) .unwrap() - .name - .clone(), + .name() + .to_string(), ExprImpl::FunctionCall(func) => { let func_name = func.func_type().as_str_name().to_string(); let mut name = func_name.clone(); @@ -367,12 +369,12 @@ fn assemble_materialize( }) .chain(include_columns.iter().map(|expr| { match expr { - ExprImpl::InputRef(input_ref) => table_desc - .columns + ExprImpl::InputRef(input_ref) => table_catalog + .columns() .get(input_ref.index) .unwrap() - .name - .clone(), + .name() + .to_string(), _ => unreachable!(), } })) diff --git a/src/frontend/src/handler/create_mv.rs b/src/frontend/src/handler/create_mv.rs index a504a92111cc4..716b3c0cdd852 100644 --- a/src/frontend/src/handler/create_mv.rs +++ b/src/frontend/src/handler/create_mv.rs @@ -189,6 +189,7 @@ It only indicates the physical clustering of the data, which may improve the per if plan.inputs().is_empty() { if let Some(scan) = plan.as_stream_table_scan() { scan.stream_scan_type() == StreamScanType::Backfill + || scan.stream_scan_type() == StreamScanType::ArrangementBackfill } else { false } diff --git a/src/frontend/src/optimizer/plan_node/generic/scan.rs b/src/frontend/src/optimizer/plan_node/generic/scan.rs index c32d942242683..cb033f80e0c6b 100644 --- a/src/frontend/src/optimizer/plan_node/generic/scan.rs +++ b/src/frontend/src/optimizer/plan_node/generic/scan.rs @@ -14,6 +14,7 @@ use std::collections::{BTreeMap, HashMap}; use std::rc::Rc; +use std::sync::Arc; use educe::Educe; use fixedbitset::FixedBitSet; @@ -28,6 +29,7 @@ use crate::expr::{Expr, ExprImpl, ExprRewriter, ExprVisitor, FunctionCall, Input use crate::optimizer::optimizer_context::OptimizerContextRef; use crate::optimizer::property::{Cardinality, FunctionalDependencySet, Order}; use crate::utils::{ColIndexMappingRewriteExt, Condition}; +use crate::TableCatalog; /// [`Scan`] returns contents of a table or other equivalent object #[derive(Debug, Clone, Educe)] @@ -37,7 +39,14 @@ pub struct Scan { /// Include `output_col_idx` and columns required in `predicate` pub required_col_idx: Vec, pub output_col_idx: Vec, - /// Descriptor of the table + /// Table Catalog of the upstream table that the descriptor is derived from. + pub table_catalog: Arc, + // FIXME(kwannoel): Currently many places in the code reference this, + // but now we have table catalog. + // We should remove this and use table catalog in those call-sites instead. + // It's introduced in https://github.com/risingwavelabs/risingwave/pull/13622. + // We kept this field to avoid extensive refactor in that PR. + /// Table Desc (subset of table catalog). 
pub table_desc: Rc, /// Descriptors of all indexes on this table pub indexes: Vec>, @@ -172,7 +181,7 @@ impl Scan { pub fn to_index_scan( &self, index_name: &str, - index_table_desc: Rc, + index_table_catalog: Arc, primary_to_secondary_mapping: &BTreeMap, function_mapping: &HashMap, ) -> Self { @@ -221,7 +230,7 @@ impl Scan { Self::new( index_name.to_string(), new_output_col_idx, - index_table_desc, + index_table_catalog, vec![], self.ctx.clone(), new_predicate, @@ -235,7 +244,7 @@ impl Scan { pub(crate) fn new( table_name: String, output_col_idx: Vec, // the column index in the table - table_desc: Rc, + table_catalog: Arc, indexes: Vec>, ctx: OptimizerContextRef, predicate: Condition, // refers to column indexes of the table @@ -245,7 +254,7 @@ impl Scan { Self::new_inner( table_name, output_col_idx, - table_desc, + table_catalog, indexes, ctx, predicate, @@ -258,7 +267,7 @@ impl Scan { pub(crate) fn new_inner( table_name: String, output_col_idx: Vec, // the column index in the table - table_desc: Rc, + table_catalog: Arc, indexes: Vec>, ctx: OptimizerContextRef, predicate: Condition, // refers to column indexes of the table @@ -274,17 +283,20 @@ impl Scan { // required columns, i.e., the mapping from operator_idx to table_idx. let mut required_col_idx = output_col_idx.clone(); - let predicate_col_idx = predicate.collect_input_refs(table_desc.columns.len()); + let predicate_col_idx = predicate.collect_input_refs(table_catalog.columns().len()); predicate_col_idx.ones().for_each(|idx| { if !required_col_idx.contains(&idx) { required_col_idx.push(idx); } }); + let table_desc = Rc::new(table_catalog.table_desc()); + Self { table_name, required_col_idx, output_col_idx, + table_catalog, table_desc, indexes, predicate, diff --git a/src/frontend/src/optimizer/plan_node/logical_scan.rs b/src/frontend/src/optimizer/plan_node/logical_scan.rs index 269633d5d74bd..36995ad4a3fe6 100644 --- a/src/frontend/src/optimizer/plan_node/logical_scan.rs +++ b/src/frontend/src/optimizer/plan_node/logical_scan.rs @@ -14,6 +14,7 @@ use std::collections::{BTreeMap, HashSet}; use std::rc::Rc; +use std::sync::Arc; use fixedbitset::FixedBitSet; use itertools::Itertools; @@ -21,6 +22,7 @@ use pretty_xmlish::{Pretty, XmlNode}; use risingwave_common::catalog::{ColumnDesc, TableDesc}; use risingwave_common::error::Result; use risingwave_common::util::sort_util::ColumnOrder; +use risingwave_pb::stream_plan::StreamScanType; use super::generic::{GenericPlanNode, GenericPlanRef}; use super::utils::{childless_record, Distill}; @@ -39,6 +41,7 @@ use crate::optimizer::plan_node::{ use crate::optimizer::property::{Cardinality, Order}; use crate::optimizer::rule::IndexSelectionRule; use crate::utils::{ColIndexMapping, Condition, ConditionDisplay}; +use crate::TableCatalog; /// `LogicalScan` returns contents of a table or other equivalent object #[derive(Debug, Clone, PartialEq, Eq, Hash)] @@ -64,16 +67,17 @@ impl LogicalScan { /// Create a [`LogicalScan`] node. Used by planner. 
pub fn create( table_name: String, // explain-only - table_desc: Rc, + table_catalog: Arc, indexes: Vec>, ctx: OptimizerContextRef, for_system_time_as_of_proctime: bool, table_cardinality: Cardinality, ) -> Self { + let output_col_idx: Vec = (0..table_catalog.columns().len()).collect(); generic::Scan::new( table_name, - (0..table_desc.columns.len()).collect(), - table_desc, + output_col_idx, + table_catalog, indexes, ctx, Condition::true_cond(), @@ -96,11 +100,17 @@ impl LogicalScan { self.core.table_cardinality } + // FIXME(kwannoel): Fetch from `table_catalog` + lazily instantiate? /// Get a reference to the logical scan's table desc. pub fn table_desc(&self) -> &TableDesc { self.core.table_desc.as_ref() } + /// FIXME + pub fn table_catalog(&self) -> Arc { + self.core.table_catalog.clone() + } + /// Get the descs of the output columns. pub fn column_descs(&self) -> Vec { self.core.column_descs() @@ -183,7 +193,7 @@ impl LogicalScan { { let index_scan = self.core.to_index_scan( &index.name, - index.index_table.table_desc().into(), + index.index_table.clone(), p2s_mapping, index.function_mapping(), ); @@ -235,7 +245,7 @@ impl LogicalScan { let scan_without_predicate = generic::Scan::new( self.table_name().to_string(), self.required_col_idx().to_vec(), - self.core.table_desc.clone(), + self.core.table_catalog.clone(), // FIXME self.indexes().to_vec(), self.ctx(), Condition::true_cond(), @@ -254,7 +264,7 @@ impl LogicalScan { generic::Scan::new_inner( self.table_name().to_string(), self.output_col_idx().to_vec(), - self.core.table_desc.clone(), + self.table_catalog(), self.indexes().to_vec(), self.base.ctx().clone(), predicate, @@ -268,7 +278,7 @@ impl LogicalScan { generic::Scan::new_inner( self.table_name().to_string(), output_col_idx, - self.core.table_desc.clone(), + self.core.table_catalog.clone(), self.indexes().to_vec(), self.base.ctx().clone(), self.predicate().clone(), @@ -512,7 +522,20 @@ impl ToBatch for LogicalScan { impl ToStream for LogicalScan { fn to_stream(&self, ctx: &mut ToStreamContext) -> Result { if self.predicate().always_true() { - Ok(StreamTableScan::new(self.core.clone()).into()) + if self + .ctx() + .session_ctx() + .config() + .streaming_enable_arrangement_backfill() + { + Ok(StreamTableScan::new_with_stream_scan_type( + self.core.clone(), + StreamScanType::ArrangementBackfill, + ) + .into()) + } else { + Ok(StreamTableScan::new(self.core.clone()).into()) + } } else { let (scan, predicate, project_expr) = self.predicate_pull_up(); let mut plan = LogicalFilter::create(scan.into(), predicate); diff --git a/src/frontend/src/optimizer/plan_node/stream_hash_join.rs b/src/frontend/src/optimizer/plan_node/stream_hash_join.rs index 514b3dfa7df1a..f83e56440fa66 100644 --- a/src/frontend/src/optimizer/plan_node/stream_hash_join.rs +++ b/src/frontend/src/optimizer/plan_node/stream_hash_join.rs @@ -136,38 +136,46 @@ impl StreamHashJoin { continue; } - let (internal, do_state_cleaning) = if key_required_larger < key_required_smaller { - ( - l2i.try_map(key_required_larger), - if !equal_condition_clean_state - && clean_left_state_conjunction_idx.is_none() - { - clean_left_state_conjunction_idx = Some(conjunction_idx); - true - } else { - false - }, - ) - } else { - ( - r2i.try_map(key_required_larger - left_cols_num), - if !equal_condition_clean_state - && clean_right_state_conjunction_idx.is_none() - { - clean_right_state_conjunction_idx = Some(conjunction_idx); - true - } else { - false - }, - ) - }; + let (internal_col1, internal_col2, do_state_cleaning) = + if 
key_required_larger < key_required_smaller { + ( + l2i.try_map(key_required_larger), + r2i.try_map(key_required_smaller - left_cols_num), + if !equal_condition_clean_state + && clean_left_state_conjunction_idx.is_none() + { + clean_left_state_conjunction_idx = Some(conjunction_idx); + true + } else { + false + }, + ) + } else { + ( + r2i.try_map(key_required_larger - left_cols_num), + l2i.try_map(key_required_smaller), + if !equal_condition_clean_state + && clean_right_state_conjunction_idx.is_none() + { + clean_right_state_conjunction_idx = Some(conjunction_idx); + true + } else { + false + }, + ) + }; let mut is_valuable_inequality = do_state_cleaning; - if let Some(internal) = internal + if let Some(internal) = internal_col1 && !watermark_columns.contains(internal) { watermark_columns.insert(internal); is_valuable_inequality = true; } + if let Some(internal) = internal_col2 + && !watermark_columns.contains(internal) + { + watermark_columns.insert(internal); + } if is_valuable_inequality { inequality_pairs.push(( do_state_cleaning, diff --git a/src/frontend/src/optimizer/plan_node/stream_table_scan.rs b/src/frontend/src/optimizer/plan_node/stream_table_scan.rs index b40f1f758c3e4..7dfff36ed7af8 100644 --- a/src/frontend/src/optimizer/plan_node/stream_table_scan.rs +++ b/src/frontend/src/optimizer/plan_node/stream_table_scan.rs @@ -13,11 +13,11 @@ // limitations under the License. use std::collections::{BTreeMap, HashMap}; -use std::rc::Rc; +use std::sync::Arc; use itertools::Itertools; use pretty_xmlish::{Pretty, XmlNode}; -use risingwave_common::catalog::{Field, TableDesc}; +use risingwave_common::catalog::Field; use risingwave_common::hash::VirtualNode; use risingwave_common::types::DataType; use risingwave_common::util::sort_util::OrderType; @@ -97,14 +97,14 @@ impl StreamTableScan { pub fn to_index_scan( &self, index_name: &str, - index_table_desc: Rc, + index_table_catalog: Arc, primary_to_secondary_mapping: &BTreeMap, function_mapping: &HashMap, stream_scan_type: StreamScanType, ) -> StreamTableScan { let logical_index_scan = self.core.to_index_scan( index_name, - index_table_desc, + index_table_catalog, primary_to_secondary_mapping, function_mapping, ); @@ -118,6 +118,11 @@ impl StreamTableScan { self.stream_scan_type } + // TODO: Add note to reviewer about safety, because of `generic::Scan` limitation. + fn get_upstream_state_table(&self) -> &TableCatalog { + self.core.table_catalog.as_ref() + } + /// Build catalog for backfill state /// /// Schema: | vnode | pk ... | `backfill_finished` | `row_count` | @@ -244,7 +249,9 @@ impl StreamTableScan { // The required columns from the table (both scan and upstream). let upstream_column_ids = match self.stream_scan_type { // For backfill, we additionally need the primary key columns. - StreamScanType::Backfill => self.core.output_and_pk_column_ids(), + StreamScanType::Backfill | StreamScanType::ArrangementBackfill => { + self.core.output_and_pk_column_ids() + } StreamScanType::Chain | StreamScanType::Rearrange | StreamScanType::UpstreamOnly => { self.core.output_column_ids() } @@ -270,6 +277,19 @@ impl StreamTableScan { let upstream_schema = snapshot_schema.clone(); + // TODO: snapshot read of upstream mview + let batch_plan_node = BatchPlanNode { + table_desc: Some(self.core.table_desc.to_protobuf()), + column_ids: upstream_column_ids.clone(), + }; + + let catalog = self + .build_backfill_state_catalog(state) + .to_internal_table_prost(); + + // For backfill, we first read pk + output_indices from upstream. 
+ // On this, we need to further project `output_indices` to the downstream. + // This `output_indices` refers to that. let output_indices = self .core .output_column_ids() @@ -282,16 +302,14 @@ impl StreamTableScan { }) .collect_vec(); - // TODO: snapshot read of upstream mview - let batch_plan_node = BatchPlanNode { - table_desc: Some(self.core.table_desc.to_protobuf()), - column_ids: upstream_column_ids.clone(), + // This refers to the output indices of the originating stream. + let upstream_table_catalog = self.get_upstream_state_table().clone(); + let arrangement_table = if self.stream_scan_type == StreamScanType::ArrangementBackfill { + Some(upstream_table_catalog.to_internal_table_prost()) + } else { + None }; - let catalog = self - .build_backfill_state_catalog(state) - .to_internal_table_prost(); - let node_body = PbNodeBody::StreamScan(StreamScanNode { table_id: self.core.table_desc.table_id.table_id, stream_scan_type: self.stream_scan_type as i32, @@ -301,6 +319,7 @@ impl StreamTableScan { // The table desc used by backfill executor table_desc: Some(self.core.table_desc.to_protobuf()), state_table: Some(catalog), + arrangement_table, rate_limit: self.base.ctx().overwrite_options().streaming_rate_limit, ..Default::default() }); @@ -308,6 +327,7 @@ impl StreamTableScan { PbStreamNode { fields: self.schema().to_prost(), input: vec![ + // Upstream updates // The merge node body will be filled by the `ActorBuilder` on the meta service. PbStreamNode { node_body: Some(PbNodeBody::Merge(Default::default())), @@ -316,6 +336,7 @@ impl StreamTableScan { stream_key: vec![], // not used ..Default::default() }, + // Snapshot read PbStreamNode { node_body: Some(PbNodeBody::BatchPlan(batch_plan_node)), operator_id: self.batch_plan_id.0 as u64, @@ -326,7 +347,6 @@ impl StreamTableScan { append_only: true, }, ], - node_body: Some(node_body), stream_key, operator_id: self.base.id().0 as u64, diff --git a/src/frontend/src/optimizer/rule/index_delta_join_rule.rs b/src/frontend/src/optimizer/rule/index_delta_join_rule.rs index c62e15220cbf1..56dcd17692d30 100644 --- a/src/frontend/src/optimizer/rule/index_delta_join_rule.rs +++ b/src/frontend/src/optimizer/rule/index_delta_join_rule.rs @@ -93,7 +93,7 @@ impl Rule for IndexDeltaJoinRule { table_scan .to_index_scan( index.index_table.name.as_str(), - index.index_table.table_desc().into(), + index.index_table.clone(), p2s_mapping, index.function_mapping(), stream_scan_type, diff --git a/src/frontend/src/optimizer/rule/index_selection_rule.rs b/src/frontend/src/optimizer/rule/index_selection_rule.rs index 15d45fda096e9..1e61baf64b82b 100644 --- a/src/frontend/src/optimizer/rule/index_selection_rule.rs +++ b/src/frontend/src/optimizer/rule/index_selection_rule.rs @@ -227,7 +227,7 @@ impl IndexSelectionRule { let index_scan = LogicalScan::create( index.index_table.name.clone(), - index.index_table.table_desc().into(), + index.index_table.clone(), vec![], logical_scan.ctx(), false, @@ -236,7 +236,7 @@ impl IndexSelectionRule { let primary_table_scan = LogicalScan::create( index.primary_table.name.clone(), - index.primary_table.table_desc().into(), + (*index.primary_table).clone().into(), vec![], logical_scan.ctx(), false, @@ -335,7 +335,7 @@ impl IndexSelectionRule { let primary_table_scan = LogicalScan::create( logical_scan.table_name().to_string(), - primary_table_desc.clone().into(), + logical_scan.table_catalog(), vec![], logical_scan.ctx(), false, @@ -567,7 +567,8 @@ impl IndexSelectionRule { .iter() .map(|x| x.column_index) .collect_vec(), - 
primary_table_desc.clone().into(), + // TODO: Should these be cloning the underlying rc instead? + logical_scan.table_catalog(), vec![], logical_scan.ctx(), Condition { @@ -609,7 +610,7 @@ impl IndexSelectionRule { .iter() .map(|x| x.column_index) .collect_vec(), - index.index_table.table_desc().into(), + index.index_table.clone(), vec![], ctx, new_predicate, diff --git a/src/frontend/src/planner/relation.rs b/src/frontend/src/planner/relation.rs index 20a682bf7fb69..d009bd0b5d7bb 100644 --- a/src/frontend/src/planner/relation.rs +++ b/src/frontend/src/planner/relation.rs @@ -67,17 +67,19 @@ impl Planner { } pub(super) fn plan_base_table(&mut self, base_table: &BoundBaseTable) -> Result { + let for_system_time_as_of_proctime = base_table.for_system_time_as_of_proctime; + let table_cardinality = base_table.table_catalog.cardinality; Ok(LogicalScan::create( base_table.table_catalog.name().to_string(), - Rc::new(base_table.table_catalog.table_desc()), + base_table.table_catalog.clone().into(), base_table .table_indexes .iter() .map(|x| x.as_ref().clone().into()) .collect(), self.ctx(), - base_table.for_system_time_as_of_proctime, - base_table.table_catalog.cardinality, + for_system_time_as_of_proctime, + table_cardinality, ) .into()) } diff --git a/src/frontend/src/scheduler/distributed/query.rs b/src/frontend/src/scheduler/distributed/query.rs index 003c19d2ec9ac..347fcf38ce5f8 100644 --- a/src/frontend/src/scheduler/distributed/query.rs +++ b/src/frontend/src/scheduler/distributed/query.rs @@ -437,11 +437,13 @@ impl QueryRunner { #[cfg(test)] pub(crate) mod tests { use std::collections::HashMap; - use std::rc::Rc; use std::sync::{Arc, RwLock}; use fixedbitset::FixedBitSet; - use risingwave_common::catalog::{ColumnDesc, TableDesc}; + use risingwave_common::catalog::hummock::PROPERTIES_RETENTION_SECOND_KEY; + use risingwave_common::catalog::{ + ColumnCatalog, ColumnDesc, ConflictBehavior, DEFAULT_SUPER_USER_ID, + }; use risingwave_common::constants::hummock::TABLE_OPTION_DUMMY_RETENTION_SECOND; use risingwave_common::hash::ParallelUnitMapping; use risingwave_common::types::DataType; @@ -452,6 +454,7 @@ pub(crate) mod tests { use crate::catalog::catalog_service::CatalogReader; use crate::catalog::root_catalog::Catalog; + use crate::catalog::table_catalog::{CreateType, TableType}; use crate::expr::InputRef; use crate::optimizer::plan_node::{ generic, BatchExchange, BatchFilter, BatchHashJoin, EqJoinPredicate, LogicalScan, ToBatch, @@ -468,6 +471,7 @@ pub(crate) mod tests { use crate::session::SessionImpl; use crate::test_utils::MockFrontendMetaClient; use crate::utils::Condition; + use crate::{TableCatalog, WithOptions}; #[tokio::test] async fn test_query_should_not_hang_with_empty_worker() { @@ -514,25 +518,60 @@ pub(crate) mod tests { // let ctx = OptimizerContext::mock().await; let table_id = 0.into(); + let table_catalog: TableCatalog = TableCatalog { + id: table_id, + associated_source_id: None, + name: "test".to_string(), + columns: vec![ + ColumnCatalog { + column_desc: ColumnDesc::new_atomic(DataType::Int32, "a", 0), + is_hidden: false, + }, + ColumnCatalog { + column_desc: ColumnDesc::new_atomic(DataType::Float64, "b", 1), + is_hidden: false, + }, + ColumnCatalog { + column_desc: ColumnDesc::new_atomic(DataType::Int64, "c", 2), + is_hidden: false, + }, + ], + pk: vec![], + stream_key: vec![], + table_type: TableType::Table, + distribution_key: vec![], + append_only: false, + owner: DEFAULT_SUPER_USER_ID, + properties: WithOptions::new( + [( + PROPERTIES_RETENTION_SECOND_KEY.into(), + 
TABLE_OPTION_DUMMY_RETENTION_SECOND.to_string(), + )] + .into_iter() + .collect(), + ), + fragment_id: 0, // FIXME + dml_fragment_id: None, // FIXME + vnode_col_index: None, + row_id_index: None, + value_indices: vec![0, 1, 2], + definition: "".to_string(), + conflict_behavior: ConflictBehavior::NoCheck, + read_prefix_len_hint: 0, + version: None, + watermark_columns: FixedBitSet::with_capacity(3), + dist_key_in_pk: vec![], + cardinality: Cardinality::unknown(), + cleaned_by_watermark: false, + created_at_epoch: None, + initialized_at_epoch: None, + create_type: CreateType::Foreground, + description: None, + incoming_sinks: vec![], + }; let batch_plan_node: PlanRef = LogicalScan::create( "".to_string(), - Rc::new(TableDesc { - table_id, - stream_key: vec![], - pk: vec![], - columns: vec![ - ColumnDesc::new_atomic(DataType::Int32, "a", 0), - ColumnDesc::new_atomic(DataType::Float64, "b", 1), - ColumnDesc::new_atomic(DataType::Int64, "c", 2), - ], - distribution_key: vec![], - append_only: false, - retention_seconds: TABLE_OPTION_DUMMY_RETENTION_SECOND, - value_indices: vec![0, 1, 2], - read_prefix_len_hint: 0, - watermark_columns: FixedBitSet::with_capacity(3), - versioned: false, - }), + table_catalog.into(), vec![], ctx, false, diff --git a/src/meta/src/hummock/manager/mod.rs b/src/meta/src/hummock/manager/mod.rs index 3bd53d67f07d8..06dcd63b69313 100644 --- a/src/meta/src/hummock/manager/mod.rs +++ b/src/meta/src/hummock/manager/mod.rs @@ -268,14 +268,14 @@ pub enum CompactionResumeTrigger { pub struct CommitEpochInfo { pub sstables: Vec, - pub new_table_watermarks: HashMap, + pub new_table_watermarks: HashMap, pub sst_to_context: HashMap, } impl CommitEpochInfo { pub fn new( sstables: Vec, - new_table_watermarks: HashMap, + new_table_watermarks: HashMap, sst_to_context: HashMap, ) -> Self { Self { @@ -975,6 +975,8 @@ impl HummockManager { .retain(|table_id, _| compact_task.existing_table_ids.contains(table_id)); compact_task.table_vnode_partition = table_to_vnode_partition; + compact_task.table_watermarks = + current_version.safe_epoch_table_watermarks(&compact_task.existing_table_ids); let mut compact_task_assignment = BTreeMapTransaction::new(&mut compaction.compact_task_assignment); diff --git a/src/meta/src/hummock/mock_hummock_meta_client.rs b/src/meta/src/hummock/mock_hummock_meta_client.rs index e85157ef9b03e..9b0b66fc1e03f 100644 --- a/src/meta/src/hummock/mock_hummock_meta_client.rs +++ b/src/meta/src/hummock/mock_hummock_meta_client.rs @@ -94,7 +94,7 @@ impl MockHummockMetaClient { &self, epoch: HummockEpoch, sstables: Vec, - new_table_watermarks: HashMap, + new_table_watermarks: HashMap, ) -> Result<()> { let sst_to_worker = sstables .iter() diff --git a/src/meta/src/manager/catalog/fragment.rs b/src/meta/src/manager/catalog/fragment.rs index 4873420a07aa8..1a95d84371cb2 100644 --- a/src/meta/src/manager/catalog/fragment.rs +++ b/src/meta/src/manager/catalog/fragment.rs @@ -184,6 +184,7 @@ impl FragmentManager { let is_backfill = if let Some(node) = &stream_node.node_body && let Some(node) = node.as_stream_scan() { node.stream_scan_type == StreamScanType::Backfill as i32 + || node.stream_scan_type == StreamScanType::ArrangementBackfill as i32 } else { false }; diff --git a/src/meta/src/stream/stream_graph/actor.rs b/src/meta/src/stream/stream_graph/actor.rs index e5579552989ce..8c0472aae14f1 100644 --- a/src/meta/src/stream/stream_graph/actor.rs +++ b/src/meta/src/stream/stream_graph/actor.rs @@ -174,6 +174,7 @@ impl ActorBuilder { downstream_fragment_id: self.fragment_id, 
}]; + // FIXME(kwannoel): This may not hold for Arrangement Backfill. // As we always use the `NoShuffle` exchange for MV on MV, there should be only one // upstream. let upstream_actor_id = upstreams.actors.as_global_ids(); diff --git a/src/object_store/src/object/mod.rs b/src/object_store/src/object/mod.rs index da846b7136a46..8e1ec690dd194 100644 --- a/src/object_store/src/object/mod.rs +++ b/src/object_store/src/object/mod.rs @@ -621,7 +621,11 @@ impl MonitoredObjectStore { .unwrap_or_else(|_| Err(ObjectError::internal("read timeout"))), }; - try_update_failure_metric(&self.object_store_metrics, &res, operation_type); + if let Err(e) = &res && e.is_object_not_found_error() && path.ends_with("manifest.json") { + // Metadata backup's manifest.json not found is expected. + } else { + try_update_failure_metric(&self.object_store_metrics, &res, operation_type); + } let data = res?; self.object_store_metrics diff --git a/src/prost/build.rs b/src/prost/build.rs index dcc4257e627b4..5b8ddda59e098 100644 --- a/src/prost/build.rs +++ b/src/prost/build.rs @@ -116,6 +116,7 @@ fn main() -> Result<(), Box> { .type_attribute("plan_common.ColumnDesc", "#[derive(Eq, Hash)]") .type_attribute("common.ColumnOrder", "#[derive(Eq, Hash)]") .type_attribute("common.OrderType", "#[derive(Eq, Hash)]") + .type_attribute("common.Buffer", "#[derive(Eq)]") // Eq is required to derive `FromJsonQueryResult` for models in risingwave_meta_model_v2. .type_attribute("hummock.TableStats", "#[derive(Eq)]") .type_attribute("hummock.SstableInfo", "#[derive(Eq)]") @@ -133,6 +134,12 @@ fn main() -> Result<(), Box> { .type_attribute("hummock.TableOption", "#[derive(Eq)]") .type_attribute("hummock.InputLevel", "#[derive(Eq)]") .type_attribute("hummock.CompactTask", "#[derive(Eq)]") + .type_attribute("hummock.TableWatermarks", "#[derive(Eq)]") + .type_attribute("hummock.VnodeWatermark", "#[derive(Eq)]") + .type_attribute( + "hummock.TableWatermarks.EpochNewWatermarks", + "#[derive(Eq)]", + ) // =================== .out_dir(out_dir.as_path()) .compile(&protos, &[proto_dir.to_string()]) diff --git a/src/sqlparser/src/ast/legacy_source.rs b/src/sqlparser/src/ast/legacy_source.rs new file mode 100644 index 0000000000000..dbc25d1b927e2 --- /dev/null +++ b/src/sqlparser/src/ast/legacy_source.rs @@ -0,0 +1,430 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Content of this file can be deleted once we stop supporting `create source` syntax v1. +//! New features shall NOT touch this file. 
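+//! It keeps the legacy `ROW FORMAT ...` definitions and converts them into the v2 `FORMAT ... ENCODE ...` form (`ConnectorSchema`) via `into_source_schema_v2`.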
+ +use std::fmt; + +use itertools::Itertools as _; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use crate::ast::{ + AstString, AstVec, ConnectorSchema, Encode, Format, Ident, ObjectName, ParseTo, SqlOption, + Value, +}; +use crate::keywords::Keyword; +use crate::parser::{Parser, ParserError}; +use crate::{impl_fmt_display, impl_parse_to}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub enum CompatibleSourceSchema { + RowFormat(SourceSchema), + V2(ConnectorSchema), +} + +impl fmt::Display for CompatibleSourceSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + CompatibleSourceSchema::RowFormat(inner) => { + write!(f, "{}", inner) + } + CompatibleSourceSchema::V2(inner) => { + write!(f, "{}", inner) + } + } + } +} + +impl CompatibleSourceSchema { + pub(crate) fn into_v2(self) -> ConnectorSchema { + match self { + CompatibleSourceSchema::RowFormat(inner) => inner.into_source_schema_v2(), + CompatibleSourceSchema::V2(inner) => inner, + } + } +} + +impl From for CompatibleSourceSchema { + fn from(value: ConnectorSchema) -> Self { + Self::V2(value) + } +} + +pub fn parse_source_schema(p: &mut Parser) -> Result { + if let Some(schema_v2) = p.parse_schema()? { + Ok(CompatibleSourceSchema::V2(schema_v2)) + } else if p.peek_nth_any_of_keywords(0, &[Keyword::ROW]) + && p.peek_nth_any_of_keywords(1, &[Keyword::FORMAT]) + { + p.expect_keyword(Keyword::ROW)?; + p.expect_keyword(Keyword::FORMAT)?; + let id = p.parse_identifier()?; + let value = id.value.to_ascii_uppercase(); + let schema = match &value[..] { + "JSON" => SourceSchema::Json, + "UPSERT_JSON" => SourceSchema::UpsertJson, + "PROTOBUF" => { + impl_parse_to!(protobuf_schema: ProtobufSchema, p); + SourceSchema::Protobuf(protobuf_schema) + } + "DEBEZIUM_JSON" => SourceSchema::DebeziumJson, + "DEBEZIUM_MONGO_JSON" => SourceSchema::DebeziumMongoJson, + "AVRO" => { + impl_parse_to!(avro_schema: AvroSchema, p); + SourceSchema::Avro(avro_schema) + } + "UPSERT_AVRO" => { + impl_parse_to!(avro_schema: AvroSchema, p); + SourceSchema::UpsertAvro(avro_schema) + } + "MAXWELL" => SourceSchema::Maxwell, + "CANAL_JSON" => SourceSchema::CanalJson, + "CSV" => { + impl_parse_to!(csv_info: CsvInfo, p); + SourceSchema::Csv(csv_info) + } + "NATIVE" => SourceSchema::Native, // used internally by schema change + "DEBEZIUM_AVRO" => { + impl_parse_to!(avro_schema: DebeziumAvroSchema, p); + SourceSchema::DebeziumAvro(avro_schema) + } + "BYTES" => SourceSchema::Bytes, + _ => { + return Err(ParserError::ParserError( + "expected JSON | UPSERT_JSON | PROTOBUF | DEBEZIUM_JSON | DEBEZIUM_AVRO \ + | AVRO | UPSERT_AVRO | MAXWELL | CANAL_JSON | BYTES | NATIVE after ROW FORMAT" + .to_string(), + )) + } + }; + Ok(CompatibleSourceSchema::RowFormat(schema)) + } else { + p.expected("description of the format", p.peek_token()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub enum SourceSchema { + Protobuf(ProtobufSchema), + // Keyword::PROTOBUF ProtobufSchema + Json, // Keyword::JSON + DebeziumJson, // Keyword::DEBEZIUM_JSON + DebeziumMongoJson, + UpsertJson, // Keyword::UPSERT_JSON + Avro(AvroSchema), // Keyword::AVRO + UpsertAvro(AvroSchema), // Keyword::UpsertAVRO + Maxwell, // Keyword::MAXWELL + CanalJson, // Keyword::CANAL_JSON + Csv(CsvInfo), // Keyword::CSV + Native, + DebeziumAvro(DebeziumAvroSchema), // Keyword::DEBEZIUM_AVRO + Bytes, +} + +impl SourceSchema { + pub fn 
into_source_schema_v2(self) -> ConnectorSchema { + let (format, row_encode) = match self { + SourceSchema::Protobuf(_) => (Format::Plain, Encode::Protobuf), + SourceSchema::Json => (Format::Plain, Encode::Json), + SourceSchema::DebeziumJson => (Format::Debezium, Encode::Json), + SourceSchema::DebeziumMongoJson => (Format::DebeziumMongo, Encode::Json), + SourceSchema::UpsertJson => (Format::Upsert, Encode::Json), + SourceSchema::Avro(_) => (Format::Plain, Encode::Avro), + SourceSchema::UpsertAvro(_) => (Format::Upsert, Encode::Avro), + SourceSchema::Maxwell => (Format::Maxwell, Encode::Json), + SourceSchema::CanalJson => (Format::Canal, Encode::Json), + SourceSchema::Csv(_) => (Format::Plain, Encode::Csv), + SourceSchema::DebeziumAvro(_) => (Format::Debezium, Encode::Avro), + SourceSchema::Bytes => (Format::Plain, Encode::Bytes), + SourceSchema::Native => (Format::Native, Encode::Native), + }; + + let row_options = match self { + SourceSchema::Protobuf(schema) => { + let mut options = vec![SqlOption { + name: ObjectName(vec![Ident { + value: "message".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(schema.message_name.0), + }]; + if schema.use_schema_registry { + options.push(SqlOption { + name: ObjectName(vec![Ident { + value: "schema.registry".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(schema.row_schema_location.0), + }); + } else { + options.push(SqlOption { + name: ObjectName(vec![Ident { + value: "schema.location".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(schema.row_schema_location.0), + }) + } + options + } + SourceSchema::Avro(schema) | SourceSchema::UpsertAvro(schema) => { + if schema.use_schema_registry { + vec![SqlOption { + name: ObjectName(vec![Ident { + value: "schema.registry".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(schema.row_schema_location.0), + }] + } else { + vec![SqlOption { + name: ObjectName(vec![Ident { + value: "schema.location".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(schema.row_schema_location.0), + }] + } + } + SourceSchema::DebeziumAvro(schema) => { + vec![SqlOption { + name: ObjectName(vec![Ident { + value: "schema.registry".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(schema.row_schema_location.0), + }] + } + SourceSchema::Csv(schema) => { + vec![ + SqlOption { + name: ObjectName(vec![Ident { + value: "delimiter".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString( + String::from_utf8_lossy(&[schema.delimiter]).into(), + ), + }, + SqlOption { + name: ObjectName(vec![Ident { + value: "without_header".into(), + quote_style: None, + }]), + value: Value::SingleQuotedString(if schema.has_header { + "false".into() + } else { + "true".into() + }), + }, + ] + } + _ => vec![], + }; + + ConnectorSchema { + format, + row_encode, + row_options, + } + } +} + +impl fmt::Display for SourceSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "ROW FORMAT ")?; + match self { + SourceSchema::Protobuf(protobuf_schema) => write!(f, "PROTOBUF {}", protobuf_schema), + SourceSchema::Json => write!(f, "JSON"), + SourceSchema::UpsertJson => write!(f, "UPSERT_JSON"), + SourceSchema::Maxwell => write!(f, "MAXWELL"), + SourceSchema::DebeziumJson => write!(f, "DEBEZIUM_JSON"), + SourceSchema::DebeziumMongoJson => write!(f, "DEBEZIUM_MONGO_JSON"), + SourceSchema::Avro(avro_schema) => write!(f, "AVRO {}", avro_schema), + SourceSchema::UpsertAvro(avro_schema) => write!(f, 
"UPSERT_AVRO {}", avro_schema), + SourceSchema::CanalJson => write!(f, "CANAL_JSON"), + SourceSchema::Csv(csv_info) => write!(f, "CSV {}", csv_info), + SourceSchema::Native => write!(f, "NATIVE"), + SourceSchema::DebeziumAvro(avro_schema) => write!(f, "DEBEZIUM_AVRO {}", avro_schema), + SourceSchema::Bytes => write!(f, "BYTES"), + } + } +} + +// sql_grammar!(ProtobufSchema { +// [Keyword::MESSAGE], +// message_name: AstString, +// [Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], +// row_schema_location: AstString, +// }); +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct ProtobufSchema { + pub message_name: AstString, + pub row_schema_location: AstString, + pub use_schema_registry: bool, +} + +impl ParseTo for ProtobufSchema { + fn parse_to(p: &mut Parser) -> Result { + impl_parse_to!([Keyword::MESSAGE], p); + impl_parse_to!(message_name: AstString, p); + impl_parse_to!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], p); + impl_parse_to!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], p); + impl_parse_to!(row_schema_location: AstString, p); + Ok(Self { + message_name, + row_schema_location, + use_schema_registry, + }) + } +} + +impl fmt::Display for ProtobufSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut v: Vec = vec![]; + impl_fmt_display!([Keyword::MESSAGE], v); + impl_fmt_display!(message_name, v, self); + impl_fmt_display!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], v); + impl_fmt_display!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], v, self); + impl_fmt_display!(row_schema_location, v, self); + v.iter().join(" ").fmt(f) + } +} + +// sql_grammar!(AvroSchema { +// [Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION, [Keyword::CONFLUENT, Keyword::SCHEMA, +// Keyword::REGISTRY]], row_schema_location: AstString, +// }); +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct AvroSchema { + pub row_schema_location: AstString, + pub use_schema_registry: bool, +} +impl ParseTo for AvroSchema { + fn parse_to(p: &mut Parser) -> Result { + impl_parse_to!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], p); + impl_parse_to!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], p); + impl_parse_to!(row_schema_location: AstString, p); + Ok(Self { + row_schema_location, + use_schema_registry, + }) + } +} + +impl fmt::Display for AvroSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut v: Vec = vec![]; + impl_fmt_display!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], v); + impl_fmt_display!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], v, self); + impl_fmt_display!(row_schema_location, v, self); + v.iter().join(" ").fmt(f) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct DebeziumAvroSchema { + pub row_schema_location: AstString, +} + +impl fmt::Display for DebeziumAvroSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut v: Vec = vec![]; + impl_fmt_display!( + [ + Keyword::ROW, + Keyword::SCHEMA, + Keyword::LOCATION, + Keyword::CONFLUENT, + Keyword::SCHEMA, + Keyword::REGISTRY + ], + v + ); + impl_fmt_display!(row_schema_location, v, self); + v.iter().join(" ").fmt(f) + } +} + +impl ParseTo for DebeziumAvroSchema { + fn parse_to(p: &mut Parser) -> 
Result { + impl_parse_to!( + [ + Keyword::ROW, + Keyword::SCHEMA, + Keyword::LOCATION, + Keyword::CONFLUENT, + Keyword::SCHEMA, + Keyword::REGISTRY + ], + p + ); + impl_parse_to!(row_schema_location: AstString, p); + Ok(Self { + row_schema_location, + }) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct CsvInfo { + pub delimiter: u8, + pub has_header: bool, +} + +pub fn get_delimiter(chars: &str) -> Result { + match chars { + "," => Ok(b','), // comma + "\t" => Ok(b'\t'), // tab + other => Err(ParserError::ParserError(format!( + "The delimiter should be one of ',', E'\\t', but got {:?}", + other + ))), + } +} + +impl ParseTo for CsvInfo { + fn parse_to(p: &mut Parser) -> Result { + impl_parse_to!(without_header => [Keyword::WITHOUT, Keyword::HEADER], p); + impl_parse_to!([Keyword::DELIMITED, Keyword::BY], p); + impl_parse_to!(delimiter: AstString, p); + let delimiter = get_delimiter(delimiter.0.as_str())?; + Ok(Self { + delimiter, + has_header: !without_header, + }) + } +} + +impl fmt::Display for CsvInfo { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut v: Vec = vec![]; + if !self.has_header { + v.push(format!( + "{}", + AstVec([Keyword::WITHOUT, Keyword::HEADER].to_vec()) + )); + } + impl_fmt_display!(delimiter, v, self); + v.iter().join(" ").fmt(f) + } +} diff --git a/src/sqlparser/src/ast/mod.rs b/src/sqlparser/src/ast/mod.rs index a57a6a9175ebd..4ccfaf0ee8f90 100644 --- a/src/sqlparser/src/ast/mod.rs +++ b/src/sqlparser/src/ast/mod.rs @@ -13,6 +13,7 @@ //! SQL Abstract Syntax Tree (AST) types mod data_type; pub(crate) mod ddl; +mod legacy_source; mod operator; mod query; mod statement; @@ -36,6 +37,9 @@ pub use self::ddl::{ AlterSchemaOperation, AlterTableOperation, ColumnDef, ColumnOption, ColumnOptionDef, ReferentialAction, SourceWatermark, TableConstraint, }; +pub use self::legacy_source::{ + get_delimiter, AvroSchema, CompatibleSourceSchema, DebeziumAvroSchema, ProtobufSchema, +}; pub use self::operator::{BinaryOperator, QualifiedOperator, UnaryOperator}; pub use self::query::{ Cte, Distinct, Fetch, Join, JoinConstraint, JoinOperator, LateralView, OrderByExpr, Query, diff --git a/src/sqlparser/src/ast/statement.rs b/src/sqlparser/src/ast/statement.rs index 133688875fd6e..f50a6a1c45450 100644 --- a/src/sqlparser/src/ast/statement.rs +++ b/src/sqlparser/src/ast/statement.rs @@ -20,7 +20,8 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; use super::ddl::SourceWatermark; -use super::{EmitMode, Ident, ObjectType, Query, Value}; +use super::legacy_source::{parse_source_schema, CompatibleSourceSchema}; +use super::{EmitMode, Ident, ObjectType, Query}; use crate::ast::{ display_comma_separated, display_separated, ColumnDef, ObjectName, SqlOption, TableConstraint, }; @@ -33,6 +34,7 @@ pub trait ParseTo: Sized { fn parse_to(parser: &mut Parser) -> Result; } +#[macro_export] macro_rules! impl_parse_to { () => {}; ($field:ident : $field_type:ty, $parser:ident) => { @@ -46,6 +48,7 @@ macro_rules! impl_parse_to { }; } +#[macro_export] macro_rules! 
impl_fmt_display { () => {}; ($field:ident, $v:ident, $self:ident) => {{ @@ -84,155 +87,6 @@ pub struct CreateSourceStatement { pub source_watermarks: Vec, } -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub enum SourceSchema { - Protobuf(ProtobufSchema), - // Keyword::PROTOBUF ProtobufSchema - Json, // Keyword::JSON - DebeziumJson, // Keyword::DEBEZIUM_JSON - DebeziumMongoJson, - UpsertJson, // Keyword::UPSERT_JSON - Avro(AvroSchema), // Keyword::AVRO - UpsertAvro(AvroSchema), // Keyword::UpsertAVRO - Maxwell, // Keyword::MAXWELL - CanalJson, // Keyword::CANAL_JSON - Csv(CsvInfo), // Keyword::CSV - Native, - DebeziumAvro(DebeziumAvroSchema), // Keyword::DEBEZIUM_AVRO - Bytes, -} - -impl SourceSchema { - pub fn into_source_schema_v2(self) -> ConnectorSchema { - let (format, row_encode) = match self { - SourceSchema::Protobuf(_) => (Format::Plain, Encode::Protobuf), - SourceSchema::Json => (Format::Plain, Encode::Json), - SourceSchema::DebeziumJson => (Format::Debezium, Encode::Json), - SourceSchema::DebeziumMongoJson => (Format::DebeziumMongo, Encode::Json), - SourceSchema::UpsertJson => (Format::Upsert, Encode::Json), - SourceSchema::Avro(_) => (Format::Plain, Encode::Avro), - SourceSchema::UpsertAvro(_) => (Format::Upsert, Encode::Avro), - SourceSchema::Maxwell => (Format::Maxwell, Encode::Json), - SourceSchema::CanalJson => (Format::Canal, Encode::Json), - SourceSchema::Csv(_) => (Format::Plain, Encode::Csv), - SourceSchema::DebeziumAvro(_) => (Format::Debezium, Encode::Avro), - SourceSchema::Bytes => (Format::Plain, Encode::Bytes), - SourceSchema::Native => (Format::Native, Encode::Native), - }; - - let row_options = match self { - SourceSchema::Protobuf(schema) => { - let mut options = vec![SqlOption { - name: ObjectName(vec![Ident { - value: "message".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(schema.message_name.0), - }]; - if schema.use_schema_registry { - options.push(SqlOption { - name: ObjectName(vec![Ident { - value: "schema.registry".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(schema.row_schema_location.0), - }); - } else { - options.push(SqlOption { - name: ObjectName(vec![Ident { - value: "schema.location".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(schema.row_schema_location.0), - }) - } - options - } - SourceSchema::Avro(schema) | SourceSchema::UpsertAvro(schema) => { - if schema.use_schema_registry { - vec![SqlOption { - name: ObjectName(vec![Ident { - value: "schema.registry".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(schema.row_schema_location.0), - }] - } else { - vec![SqlOption { - name: ObjectName(vec![Ident { - value: "schema.location".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(schema.row_schema_location.0), - }] - } - } - SourceSchema::DebeziumAvro(schema) => { - vec![SqlOption { - name: ObjectName(vec![Ident { - value: "schema.registry".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(schema.row_schema_location.0), - }] - } - SourceSchema::Csv(schema) => { - vec![ - SqlOption { - name: ObjectName(vec![Ident { - value: "delimiter".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString( - String::from_utf8_lossy(&[schema.delimiter]).into(), - ), - }, - SqlOption { - name: ObjectName(vec![Ident { - value: "without_header".into(), - quote_style: None, - }]), - value: Value::SingleQuotedString(if schema.has_header { - "false".into() 
- } else { - "true".into() - }), - }, - ] - } - _ => vec![], - }; - - ConnectorSchema { - format, - row_encode, - row_options, - } - } -} - -impl fmt::Display for SourceSchema { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "ROW FORMAT ")?; - match self { - SourceSchema::Protobuf(protobuf_schema) => write!(f, "PROTOBUF {}", protobuf_schema), - SourceSchema::Json => write!(f, "JSON"), - SourceSchema::UpsertJson => write!(f, "UPSERT_JSON"), - SourceSchema::Maxwell => write!(f, "MAXWELL"), - SourceSchema::DebeziumJson => write!(f, "DEBEZIUM_JSON"), - SourceSchema::DebeziumMongoJson => write!(f, "DEBEZIUM_MONGO_JSON"), - SourceSchema::Avro(avro_schema) => write!(f, "AVRO {}", avro_schema), - SourceSchema::UpsertAvro(avro_schema) => write!(f, "UPSERT_AVRO {}", avro_schema), - SourceSchema::CanalJson => write!(f, "CANAL_JSON"), - SourceSchema::Csv(csv_info) => write!(f, "CSV {}", csv_info), - SourceSchema::Native => write!(f, "NATIVE"), - SourceSchema::DebeziumAvro(avro_schema) => write!(f, "DEBEZIUM_AVRO {}", avro_schema), - SourceSchema::Bytes => write!(f, "BYTES"), - } - } -} - #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum Format { @@ -341,96 +195,6 @@ pub struct ConnectorSchema { pub row_options: Vec, } -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub enum CompatibleSourceSchema { - RowFormat(SourceSchema), - V2(ConnectorSchema), -} - -impl fmt::Display for CompatibleSourceSchema { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - CompatibleSourceSchema::RowFormat(inner) => { - write!(f, "{}", inner) - } - CompatibleSourceSchema::V2(inner) => { - write!(f, "{}", inner) - } - } - } -} - -impl CompatibleSourceSchema { - pub(crate) fn into_v2(self) -> ConnectorSchema { - match self { - CompatibleSourceSchema::RowFormat(inner) => inner.into_source_schema_v2(), - CompatibleSourceSchema::V2(inner) => inner, - } - } -} - -impl From for CompatibleSourceSchema { - fn from(value: ConnectorSchema) -> Self { - Self::V2(value) - } -} - -fn parse_source_schema(p: &mut Parser) -> Result { - if let Some(schema_v2) = p.parse_schema()? { - Ok(CompatibleSourceSchema::V2(schema_v2)) - } else if p.peek_nth_any_of_keywords(0, &[Keyword::ROW]) - && p.peek_nth_any_of_keywords(1, &[Keyword::FORMAT]) - { - p.expect_keyword(Keyword::ROW)?; - p.expect_keyword(Keyword::FORMAT)?; - let id = p.parse_identifier()?; - let value = id.value.to_ascii_uppercase(); - let schema = match &value[..] 
{ - "JSON" => SourceSchema::Json, - "UPSERT_JSON" => SourceSchema::UpsertJson, - "PROTOBUF" => { - impl_parse_to!(protobuf_schema: ProtobufSchema, p); - SourceSchema::Protobuf(protobuf_schema) - } - "DEBEZIUM_JSON" => SourceSchema::DebeziumJson, - "DEBEZIUM_MONGO_JSON" => SourceSchema::DebeziumMongoJson, - "AVRO" => { - impl_parse_to!(avro_schema: AvroSchema, p); - SourceSchema::Avro(avro_schema) - } - "UPSERT_AVRO" => { - impl_parse_to!(avro_schema: AvroSchema, p); - SourceSchema::UpsertAvro(avro_schema) - } - "MAXWELL" => SourceSchema::Maxwell, - "CANAL_JSON" => SourceSchema::CanalJson, - "CSV" => { - impl_parse_to!(csv_info: CsvInfo, p); - SourceSchema::Csv(csv_info) - } - "NATIVE" => SourceSchema::Native, // used internally by schema change - "DEBEZIUM_AVRO" => { - impl_parse_to!(avro_schema: DebeziumAvroSchema, p); - SourceSchema::DebeziumAvro(avro_schema) - } - "BYTES" => SourceSchema::Bytes, - _ => { - return Err(ParserError::ParserError( - "expected JSON | UPSERT_JSON | PROTOBUF | DEBEZIUM_JSON | DEBEZIUM_AVRO \ - | AVRO | UPSERT_AVRO | MAXWELL | CANAL_JSON | BYTES | NATIVE after ROW FORMAT" - .to_string(), - )) - } - }; - Ok(CompatibleSourceSchema::RowFormat(schema)) - } else { - Err(ParserError::ParserError( - "expect description of the format".to_string(), - )) - } -} - impl Parser { /// Peek the next tokens to see if it is `FORMAT` or `ROW FORMAT` (for compatibility). fn peek_source_schema_format(&mut self) -> bool { @@ -554,169 +318,6 @@ impl fmt::Display for ConnectorSchema { } } -// sql_grammar!(ProtobufSchema { -// [Keyword::MESSAGE], -// message_name: AstString, -// [Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], -// row_schema_location: AstString, -// }); -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct ProtobufSchema { - pub message_name: AstString, - pub row_schema_location: AstString, - pub use_schema_registry: bool, -} - -impl ParseTo for ProtobufSchema { - fn parse_to(p: &mut Parser) -> Result { - impl_parse_to!([Keyword::MESSAGE], p); - impl_parse_to!(message_name: AstString, p); - impl_parse_to!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], p); - impl_parse_to!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], p); - impl_parse_to!(row_schema_location: AstString, p); - Ok(Self { - message_name, - row_schema_location, - use_schema_registry, - }) - } -} - -impl fmt::Display for ProtobufSchema { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut v: Vec = vec![]; - impl_fmt_display!([Keyword::MESSAGE], v); - impl_fmt_display!(message_name, v, self); - impl_fmt_display!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], v); - impl_fmt_display!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], v, self); - impl_fmt_display!(row_schema_location, v, self); - v.iter().join(" ").fmt(f) - } -} - -// sql_grammar!(AvroSchema { -// [Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION, [Keyword::CONFLUENT, Keyword::SCHEMA, -// Keyword::REGISTRY]], row_schema_location: AstString, -// }); -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct AvroSchema { - pub row_schema_location: AstString, - pub use_schema_registry: bool, -} -impl ParseTo for AvroSchema { - fn parse_to(p: &mut Parser) -> Result { - impl_parse_to!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], p); - impl_parse_to!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, 
Keyword::REGISTRY], p); - impl_parse_to!(row_schema_location: AstString, p); - Ok(Self { - row_schema_location, - use_schema_registry, - }) - } -} - -impl fmt::Display for AvroSchema { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut v: Vec = vec![]; - impl_fmt_display!([Keyword::ROW, Keyword::SCHEMA, Keyword::LOCATION], v); - impl_fmt_display!(use_schema_registry => [Keyword::CONFLUENT, Keyword::SCHEMA, Keyword::REGISTRY], v, self); - impl_fmt_display!(row_schema_location, v, self); - v.iter().join(" ").fmt(f) - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct DebeziumAvroSchema { - pub row_schema_location: AstString, -} - -impl fmt::Display for DebeziumAvroSchema { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut v: Vec = vec![]; - impl_fmt_display!( - [ - Keyword::ROW, - Keyword::SCHEMA, - Keyword::LOCATION, - Keyword::CONFLUENT, - Keyword::SCHEMA, - Keyword::REGISTRY - ], - v - ); - impl_fmt_display!(row_schema_location, v, self); - v.iter().join(" ").fmt(f) - } -} - -impl ParseTo for DebeziumAvroSchema { - fn parse_to(p: &mut Parser) -> Result { - impl_parse_to!( - [ - Keyword::ROW, - Keyword::SCHEMA, - Keyword::LOCATION, - Keyword::CONFLUENT, - Keyword::SCHEMA, - Keyword::REGISTRY - ], - p - ); - impl_parse_to!(row_schema_location: AstString, p); - Ok(Self { - row_schema_location, - }) - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct CsvInfo { - pub delimiter: u8, - pub has_header: bool, -} - -pub fn get_delimiter(chars: &str) -> Result { - match chars { - "," => Ok(b','), // comma - "\t" => Ok(b'\t'), // tab - other => Err(ParserError::ParserError(format!( - "The delimiter should be one of ',', E'\\t', but got {:?}", - other - ))), - } -} - -impl ParseTo for CsvInfo { - fn parse_to(p: &mut Parser) -> Result { - impl_parse_to!(without_header => [Keyword::WITHOUT, Keyword::HEADER], p); - impl_parse_to!([Keyword::DELIMITED, Keyword::BY], p); - impl_parse_to!(delimiter: AstString, p); - let delimiter = get_delimiter(delimiter.0.as_str())?; - Ok(Self { - delimiter, - has_header: !without_header, - }) - } -} - -impl fmt::Display for CsvInfo { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut v: Vec = vec![]; - if !self.has_header { - v.push(format!( - "{}", - AstVec([Keyword::WITHOUT, Keyword::HEADER].to_vec()) - )); - } - impl_fmt_display!(delimiter, v, self); - v.iter().join(" ").fmt(f) - } -} - impl ParseTo for CreateSourceStatement { fn parse_to(p: &mut Parser) -> Result { impl_parse_to!(if_not_exists => [Keyword::IF, Keyword::NOT, Keyword::EXISTS], p); diff --git a/src/sqlparser/src/parser.rs b/src/sqlparser/src/parser.rs index fcd3a3dff3e36..4eb80286cf8e0 100644 --- a/src/sqlparser/src/parser.rs +++ b/src/sqlparser/src/parser.rs @@ -2471,17 +2471,12 @@ impl Parser { let cdc_table_info = if self.parse_keyword(Keyword::FROM) { let source_name = self.parse_object_name()?; - if self.parse_keyword(Keyword::TABLE) { - let external_table_name = self.parse_literal_string()?; - Some(CdcTableInfo { - source_name, - external_table_name, - }) - } else { - return Err(ParserError::ParserError( - "Expect a TABLE clause on table created by CREATE TABLE FROM".to_string(), - )); - } + self.expect_keyword(Keyword::TABLE)?; + let external_table_name = self.parse_literal_string()?; + Some(CdcTableInfo { + source_name, + external_table_name, + }) } else { None }; diff --git 
a/src/sqlparser/tests/testdata/create.yaml b/src/sqlparser/tests/testdata/create.yaml index dd189960e213a..4da81a4c43325 100644 --- a/src/sqlparser/tests/testdata/create.yaml +++ b/src/sqlparser/tests/testdata/create.yaml @@ -16,13 +16,23 @@ - input: CREATE TABLE t (a INT, b INT) AS SELECT 1 AS b, 2 AS a formatted_sql: CREATE TABLE t (a INT, b INT) AS SELECT 1 AS b, 2 AS a - input: CREATE SOURCE src - error_msg: 'sql parser error: expect description of the format' + error_msg: |- + sql parser error: Expected description of the format, found: EOF at the end + Near "CREATE SOURCE src" +- input: CREATE SOURCE src-a FORMAT PLAIN ENCODE JSON + error_msg: |- + sql parser error: Expected description of the format, found: - at line:1, column:19 + Near "CREATE SOURCE src" - input: CREATE SOURCE src FORMAT PLAIN ENCODE JSON formatted_sql: CREATE SOURCE src FORMAT PLAIN ENCODE JSON - input: CREATE SOURCE mysql_src with ( connector = 'mysql-cdc', hostname = 'localhost', port = '3306', database.name = 'mytest', server.id = '5601' ) formatted_sql: CREATE SOURCE mysql_src WITH (connector = 'mysql-cdc', hostname = 'localhost', port = '3306', database.name = 'mytest', server.id = '5601') FORMAT PLAIN ENCODE JSON - input: CREATE TABLE sbtest10 (id INT PRIMARY KEY, k INT, c CHARACTER VARYING, pad CHARACTER VARYING) FROM sbtest TABLE 'mydb.sbtest10' formatted_sql: CREATE TABLE sbtest10 (id INT PRIMARY KEY, k INT, c CHARACTER VARYING, pad CHARACTER VARYING) FROM sbtest TABLE 'mydb.sbtest10' +- input: CREATE TABLE sbtest10 (id INT PRIMARY KEY, k INT, c CHARACTER VARYING, pad CHARACTER VARYING) FROM sbtest + error_msg: |- + sql parser error: Expected TABLE, found: EOF at the end + Near "pad CHARACTER VARYING) FROM sbtest" - input: CREATE SOURCE IF NOT EXISTS src WITH (kafka.topic = 'abc', kafka.servers = 'localhost:1001') FORMAT PLAIN ENCODE PROTOBUF (message = 'Foo', schema.location = 'file://') formatted_sql: CREATE SOURCE IF NOT EXISTS src WITH (kafka.topic = 'abc', kafka.servers = 'localhost:1001') FORMAT PLAIN ENCODE PROTOBUF (message = 'Foo', schema.location = 'file://') formatted_ast: 'CreateSource { stmt: CreateSourceStatement { if_not_exists: true, columns: [], constraints: [], source_name: ObjectName([Ident { value: "src", quote_style: None }]), with_properties: WithProperties([SqlOption { name: ObjectName([Ident { value: "kafka", quote_style: None }, Ident { value: "topic", quote_style: None }]), value: SingleQuotedString("abc") }, SqlOption { name: ObjectName([Ident { value: "kafka", quote_style: None }, Ident { value: "servers", quote_style: None }]), value: SingleQuotedString("localhost:1001") }]), source_schema: V2(ConnectorSchema { format: Plain, row_encode: Protobuf, row_options: [SqlOption { name: ObjectName([Ident { value: "message", quote_style: None }]), value: SingleQuotedString("Foo") }, SqlOption { name: ObjectName([Ident { value: "schema", quote_style: None }, Ident { value: "location", quote_style: None }]), value: SingleQuotedString("file://") }] }), source_watermarks: [] } }' diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml index 0c1045fac230c..07bb7fd528890 100644 --- a/src/storage/Cargo.toml +++ b/src/storage/Cargo.toml @@ -89,6 +89,7 @@ workspace-hack = { path = "../workspace-hack" } [dev-dependencies] criterion = { workspace = true, features = ["async_futures"] } +expect-test = "1" moka = { version = "0.12", features = ["future"] } risingwave_hummock_sdk = { workspace = true, features = ["enable_test_epoch"] } risingwave_test_runner = { workspace = true } diff --git 
a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs index a86a31769f28f..1a85bea02b504 100644 --- a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs +++ b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs @@ -24,7 +24,7 @@ use risingwave_pb::hummock::hummock_version_delta::GroupDeltas; use risingwave_pb::hummock::{ CompactionConfig, CompatibilityVersion, GroupConstruct, GroupDestroy, GroupMetaChange, GroupTableChange, HummockVersion, HummockVersionDelta, Level, LevelType, OverlappingLevel, - PbLevelType, SstableInfo, + PbLevelType, PbTableWatermarks, SstableInfo, }; use tracing::warn; @@ -189,6 +189,47 @@ impl HummockVersion { .map(|group| group.levels.len() + 1) .unwrap_or(0) } + + pub fn safe_epoch_table_watermarks( + &self, + existing_table_ids: &[u32], + ) -> BTreeMap { + fn extract_single_table_watermark( + table_watermarks: &PbTableWatermarks, + safe_epoch: u64, + ) -> Option { + if let Some(first_epoch_watermark) = table_watermarks.epoch_watermarks.first() { + assert!( + first_epoch_watermark.epoch >= safe_epoch, + "smallest epoch {} in table watermark should be at least safe epoch {}", + first_epoch_watermark.epoch, + safe_epoch + ); + if first_epoch_watermark.epoch == safe_epoch { + Some(PbTableWatermarks { + epoch_watermarks: vec![first_epoch_watermark.clone()], + is_ascending: table_watermarks.is_ascending, + }) + } else { + None + } + } else { + None + } + } + self.table_watermarks + .iter() + .filter_map(|(table_id, table_watermarks)| { + let u32_table_id = *table_id as _; + if !existing_table_ids.contains(&u32_table_id) { + None + } else { + extract_single_table_watermark(table_watermarks, self.safe_epoch) + .map(|table_watermarks| (*table_id, table_watermarks)) + } + }) + .collect() + } } pub type SstSplitInfo = ( diff --git a/src/storage/hummock_sdk/src/key.rs b/src/storage/hummock_sdk/src/key.rs index ba2b55a5e7849..a2fb4ef99a0cc 100644 --- a/src/storage/hummock_sdk/src/key.rs +++ b/src/storage/hummock_sdk/src/key.rs @@ -510,7 +510,9 @@ impl> UserKey { } /// Encode in to a buffer. 
- pub fn encode_length_prefixed(&self, buf: &mut impl BufMut) { + /// + /// length prefixed requires 4B more than its `encoded_len()` + pub fn encode_length_prefixed(&self, mut buf: impl BufMut) { buf.put_u32(self.table_id.table_id()); buf.put_u32(self.table_key.as_ref().len() as u32); buf.put_slice(self.table_key.as_ref()); diff --git a/src/storage/hummock_sdk/src/table_watermark.rs b/src/storage/hummock_sdk/src/table_watermark.rs index db38eedd6a06d..cd427832f4aaf 100644 --- a/src/storage/hummock_sdk/src/table_watermark.rs +++ b/src/storage/hummock_sdk/src/table_watermark.rs @@ -156,9 +156,9 @@ impl TableWatermarks { } pub fn merge_multiple_new_table_watermarks( - table_watermarks_list: impl IntoIterator>, -) -> HashMap { - let mut ret: HashMap)> = HashMap::new(); + table_watermarks_list: impl IntoIterator>, +) -> HashMap { + let mut ret: HashMap)> = HashMap::new(); for table_watermarks in table_watermarks_list { for (table_id, new_table_watermarks) in table_watermarks { let epoch_watermarks = match ret.entry(table_id) { diff --git a/src/storage/hummock_test/src/test_utils.rs b/src/storage/hummock_test/src/test_utils.rs index 3b7d6701ed886..e06f798fc76a9 100644 --- a/src/storage/hummock_test/src/test_utils.rs +++ b/src/storage/hummock_test/src/test_utils.rs @@ -262,9 +262,7 @@ impl HummockTestEnv { res.uncommitted_ssts, res.table_watermarks .into_iter() - .map(|(table_id, watermark)| { - (table_id.table_id as u64, watermark.to_protobuf()) - }) + .map(|(table_id, watermark)| (table_id.table_id, watermark.to_protobuf())) .collect(), ) .await diff --git a/src/storage/src/hummock/compactor/compactor_runner.rs b/src/storage/src/hummock/compactor/compactor_runner.rs index a137b1f101a6a..47443a3a6fee9 100644 --- a/src/storage/src/hummock/compactor/compactor_runner.rs +++ b/src/storage/src/hummock/compactor/compactor_runner.rs @@ -43,7 +43,8 @@ use crate::hummock::compactor::{ fast_compactor_runner, CompactOutput, CompactionFilter, Compactor, CompactorContext, }; use crate::hummock::iterator::{ - Forward, ForwardMergeRangeIterator, HummockIterator, UnorderedMergeIteratorInner, + Forward, ForwardMergeRangeIterator, HummockIterator, SkipWatermarkIterator, + UnorderedMergeIteratorInner, }; use crate::hummock::multi_builder::{CapacitySplitTableBuilder, TableBuilderFactory}; use crate::hummock::value::HummockValue; @@ -224,8 +225,14 @@ impl CompactorRunner { } } } + + // The `SkipWatermarkIterator` is used to handle the table watermark state cleaning introduced + // in https://github.com/risingwavelabs/risingwave/issues/13148 Ok(( - UnorderedMergeIteratorInner::for_compactor(table_iters), + SkipWatermarkIterator::from_safe_epoch_watermarks( + UnorderedMergeIteratorInner::for_compactor(table_iters), + &self.compact_task.table_watermarks, + ), CompactionDeleteRangeIterator::new(del_iter), )) } diff --git a/src/storage/src/hummock/iterator/skip_watermark.rs b/src/storage/src/hummock/iterator/skip_watermark.rs index 58180ff356fc7..09644b2ab7475 100644 --- a/src/storage/src/hummock/iterator/skip_watermark.rs +++ b/src/storage/src/hummock/iterator/skip_watermark.rs @@ -16,10 +16,12 @@ use std::cmp::Ordering; use std::collections::{BTreeMap, VecDeque}; use bytes::Bytes; +use risingwave_common::buffer::Bitmap; use risingwave_common::catalog::TableId; -use risingwave_common::hash::VirtualNode; +use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; use risingwave_hummock_sdk::key::FullKey; use risingwave_hummock_sdk::table_watermark::{ReadTableWatermark, WatermarkDirection}; +use
risingwave_pb::hummock::PbTableWatermarks; use crate::hummock::iterator::{Forward, HummockIterator}; use crate::hummock::value::HummockValue; @@ -41,6 +43,51 @@ impl> SkipWatermarkIterator { } } + pub fn from_safe_epoch_watermarks( + inner: I, + safe_epoch_watermarks: &BTreeMap, + ) -> Self { + let watermarks = safe_epoch_watermarks + .iter() + .map(|(table_id, watermarks)| { + assert_eq!(watermarks.epoch_watermarks.len(), 1); + let vnode_watermarks = &watermarks + .epoch_watermarks + .first() + .expect("should exist") + .watermarks; + let mut vnode_watermark_map = BTreeMap::new(); + for vnode_watermark in vnode_watermarks { + let watermark = Bytes::copy_from_slice(&vnode_watermark.watermark); + for vnode in + Bitmap::from(vnode_watermark.vnode_bitmap.as_ref().expect("should exist")) + .iter_vnodes() + { + assert!( + vnode_watermark_map + .insert(vnode, watermark.clone()) + .is_none(), + "duplicate table watermark on vnode {}", + vnode.to_index() + ); + } + } + ( + TableId::from(*table_id), + ReadTableWatermark { + direction: if watermarks.is_ascending { + WatermarkDirection::Ascending + } else { + WatermarkDirection::Descending + }, + vnode_watermarks: vnode_watermark_map, + }, + ) + }) + .collect(); + Self::new(inner, watermarks) + } + fn reset_watermark(&mut self) { self.remain_watermarks = self .watermarks diff --git a/src/storage/src/hummock/sstable/mod.rs b/src/storage/src/hummock/sstable/mod.rs index 65c38c68c3bc2..039e7962f2d7c 100644 --- a/src/storage/src/hummock/sstable/mod.rs +++ b/src/storage/src/hummock/sstable/mod.rs @@ -65,7 +65,6 @@ use super::{HummockError, HummockResult}; use crate::hummock::CachePolicy; use crate::store::ReadOptions; -const DEFAULT_META_BUFFER_CAPACITY: usize = 4096; const MAGIC: u32 = 0x5785ab73; const OLD_VERSION: u32 = 1; const VERSION: u32 = 2; @@ -164,8 +163,10 @@ impl MonotonicDeleteEvent { } } - pub fn encode(&self, buf: &mut Vec) { - self.event_key.left_user_key.encode_length_prefixed(buf); + pub fn encode(&self, mut buf: impl BufMut) { + self.event_key + .left_user_key + .encode_length_prefixed(&mut buf); buf.put_u8(if self.event_key.is_exclude_left_key { 1 } else { @@ -191,6 +192,7 @@ impl MonotonicDeleteEvent { #[inline] pub fn encoded_size(&self) -> usize { + // length prefixed requires 4B more than its `encoded_len()` 4 + self.event_key.left_user_key.encoded_len() + 1 + 8 } } @@ -292,7 +294,7 @@ impl BlockMeta { /// ```plain /// | offset (4B) | len (4B) | uncompressed size (4B) | smallest key len (4B) | smallest key | /// ``` - pub fn encode(&self, buf: &mut Vec) { + pub fn encode(&self, mut buf: impl BufMut) { buf.put_u32_le(self.offset); buf.put_u32_le(self.len); buf.put_u32_le(self.uncompressed_size); @@ -389,13 +391,15 @@ impl SstableMeta { /// | checksum (8B) | version (4B) | magic (4B) | /// ``` pub fn encode_to_bytes(&self) -> Vec { - let mut buf = Vec::with_capacity(DEFAULT_META_BUFFER_CAPACITY); + let encoded_size = self.encoded_size(); + let mut buf = Vec::with_capacity(encoded_size); self.encode_to(&mut buf); buf } - pub fn encode_to(&self, buf: &mut Vec) { - let start_offset = buf.len(); + pub fn encode_to(&self, mut buf: impl BufMut + AsRef<[u8]>) { + let start = buf.as_ref().len(); + buf.put_u32_le( utils::checked_into_u32(self.block_metas.len()).unwrap_or_else(|_| { let tmp_full_key = FullKey::decode(&self.smallest_key); @@ -407,13 +411,13 @@ impl SstableMeta { }), ); for block_meta in &self.block_metas { - block_meta.encode(buf); + block_meta.encode(&mut buf); } - put_length_prefixed_slice(buf, &self.bloom_filter); + 
put_length_prefixed_slice(&mut buf, &self.bloom_filter); buf.put_u32_le(self.estimated_size); buf.put_u32_le(self.key_count); - put_length_prefixed_slice(buf, &self.smallest_key); - put_length_prefixed_slice(buf, &self.largest_key); + put_length_prefixed_slice(&mut buf, &self.smallest_key); + put_length_prefixed_slice(&mut buf, &self.largest_key); buf.put_u32_le( utils::checked_into_u32(self.monotonic_tombstone_events.len()).unwrap_or_else(|_| { let tmp_full_key = FullKey::decode(&self.smallest_key); @@ -425,10 +429,13 @@ impl SstableMeta { }), ); for monotonic_tombstone_event in &self.monotonic_tombstone_events { - monotonic_tombstone_event.encode(buf); + monotonic_tombstone_event.encode(&mut buf); } buf.put_u64_le(self.meta_offset); - let checksum = xxhash64_checksum(&buf[start_offset..]); + + let end = buf.as_ref().len(); + + let checksum = xxhash64_checksum(&buf.as_ref()[start..end]); buf.put_u64_le(checksum); buf.put_u32_le(VERSION); buf.put_u32_le(MAGIC); diff --git a/src/storage/src/hummock/sstable/utils.rs b/src/storage/src/hummock/sstable/utils.rs index b754b17f4a3dc..920dd2c75b611 100644 --- a/src/storage/src/hummock/sstable/utils.rs +++ b/src/storage/src/hummock/sstable/utils.rs @@ -71,7 +71,7 @@ pub fn xxhash64_verify(data: &[u8], checksum: u64) -> HummockResult<()> { use bytes::{Buf, BufMut}; -pub fn put_length_prefixed_slice(buf: &mut Vec, slice: &[u8]) { +pub fn put_length_prefixed_slice(mut buf: impl BufMut, slice: &[u8]) { let len = checked_into_u32(slice.len()) .unwrap_or_else(|_| panic!("WARN overflow can't convert slice {} into u32", slice.len())); buf.put_u32_le(len); diff --git a/src/storage/src/row_serde/mod.rs b/src/storage/src/row_serde/mod.rs index cac6fa320ea80..5fc99b8b6945a 100644 --- a/src/storage/src/row_serde/mod.rs +++ b/src/storage/src/row_serde/mod.rs @@ -19,7 +19,7 @@ pub mod row_serde_util; pub mod value_serde; -/// Find out the [`ColumnDesc`] by a list of [`ColumnId`]. +/// Find out the [`ColumnDesc`] selected with a list of [`ColumnId`]. 
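`put_length_prefixed_slice` and the other encoders above now take `mut buf: impl BufMut` instead of `&mut Vec<u8>`; since `bytes` implements `BufMut` for `&mut T` where `T: BufMut`, existing call sites keep passing `&mut buf` and nested encoders simply forward it. A hedged sketch with an illustrative function name:

use bytes::{BufMut, BytesMut};

// Hypothetical free function mirroring the new `mut buf: impl BufMut` signature style.
fn put_length_prefixed(mut buf: impl BufMut, slice: &[u8]) {
    buf.put_u32_le(slice.len() as u32);
    buf.put_slice(slice);
}

fn main() {
    let mut v: Vec<u8> = Vec::new();
    put_length_prefixed(&mut v, b"abc"); // `&mut Vec<u8>` implements `BufMut`
    let mut b = BytesMut::new();
    put_length_prefixed(&mut b, b"abc"); // ...and so does `&mut BytesMut`
    assert_eq!(v.len(), 4 + 3); // 4-byte length prefix + payload
}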
/// /// # Returns /// @@ -57,3 +57,97 @@ impl ColumnMapping { origin_row.project(&self.output_indices) } } + +#[cfg(test)] +mod test { + use std::fmt::Debug; + + use expect_test::{expect, Expect}; + use risingwave_common::types::DataType; + + use super::*; + + fn check(actual: impl Debug, expect: Expect) { + let actual = format!("{:#?}", actual); + expect.assert_eq(&actual); + } + + #[test] + fn test_find_columns_by_ids() { + let table_columns = vec![ + ColumnDesc::unnamed(1.into(), DataType::Varchar), + ColumnDesc::unnamed(2.into(), DataType::Int64), + ColumnDesc::unnamed(3.into(), DataType::Int16), + ]; + let column_ids = vec![2.into(), 3.into()]; + let result = find_columns_by_ids(&table_columns, &column_ids); + check( + result, + expect![[r#" + ( + [ + ColumnDesc { + data_type: Int64, + column_id: #2, + name: "", + field_descs: [], + type_name: "", + generated_or_default_column: None, + description: None, + }, + ColumnDesc { + data_type: Int16, + column_id: #3, + name: "", + field_descs: [], + type_name: "", + generated_or_default_column: None, + description: None, + }, + ], + [ + 1, + 2, + ], + )"#]], + ); + + let table_columns = vec![ + ColumnDesc::unnamed(2.into(), DataType::Int64), + ColumnDesc::unnamed(1.into(), DataType::Varchar), + ColumnDesc::unnamed(3.into(), DataType::Int16), + ]; + let column_ids = vec![2.into(), 1.into()]; + let result = find_columns_by_ids(&table_columns, &column_ids); + check( + result, + expect![[r#" + ( + [ + ColumnDesc { + data_type: Int64, + column_id: #2, + name: "", + field_descs: [], + type_name: "", + generated_or_default_column: None, + description: None, + }, + ColumnDesc { + data_type: Varchar, + column_id: #1, + name: "", + field_descs: [], + type_name: "", + generated_or_default_column: None, + description: None, + }, + ], + [ + 0, + 1, + ], + )"#]], + ); + } +} diff --git a/src/storage/src/table/batch_table/storage_table.rs b/src/storage/src/table/batch_table/storage_table.rs index 6edbf68ec7427..24d64076aa9ac 100644 --- a/src/storage/src/table/batch_table/storage_table.rs +++ b/src/storage/src/table/batch_table/storage_table.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::default::Default; use std::ops::Bound::{self, Excluded, Included, Unbounded}; use std::ops::{Index, RangeBounds}; use std::sync::Arc; @@ -117,14 +118,21 @@ impl std::fmt::Debug for StorageTableInner StorageTableInner { /// Create a [`StorageTableInner`] given a complete set of `columns` and a partial - /// set of `column_ids`. The output will only contains columns with the given ids in the same - /// order. + /// set of `output_column_ids`. + /// When reading from the storage table, + /// the chunks or rows will only contain columns with the given ids (`output_column_ids`). + /// They will in the same order as the given `output_column_ids`. + /// + /// NOTE(kwannoel): The `output_column_ids` here may be slightly different + /// from those supplied to associated executors. + /// These `output_column_ids` may have `pk` appended, since they will be needed to scan from + /// storage. The associated executors may not have these `pk` fields. 
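A positions-only restatement of `find_columns_by_ids`, as exercised by the `test_find_columns_by_ids` cases above and used by `new_partial` to resolve `output_column_ids` into output indices (simplified to bare ids; the real helper also returns the matching `ColumnDesc`s):

fn find_positions_by_ids(table_column_ids: &[i32], requested: &[i32]) -> Vec<usize> {
    requested
        .iter()
        .map(|id| {
            table_column_ids
                .iter()
                .position(|c| c == id)
                .expect("requested column id must exist in the table")
        })
        .collect()
}

fn main() {
    // mirrors the second expect-test case: table columns [#2, #1, #3], requested [#2, #1]
    assert_eq!(find_positions_by_ids(&[2, 1, 3], &[2, 1]), vec![0, 1]);
    // and the first: table columns [#1, #2, #3], requested [#2, #3]
    assert_eq!(find_positions_by_ids(&[1, 2, 3], &[2, 3]), vec![1, 2]);
}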
#[allow(clippy::too_many_arguments)] pub fn new_partial( store: S, table_id: TableId, table_columns: Vec, - column_ids: Vec, + output_column_ids: Vec, order_types: Vec, pk_indices: Vec, distribution: Distribution, @@ -137,7 +145,7 @@ impl StorageTableInner { store, table_id, table_columns, - column_ids, + output_column_ids, order_types, pk_indices, distribution, @@ -156,12 +164,12 @@ impl StorageTableInner { pk_indices: Vec, value_indices: Vec, ) -> Self { - let column_ids = columns.iter().map(|c| c.column_id).collect(); + let output_column_ids = columns.iter().map(|c| c.column_id).collect(); Self::new_inner( store, table_id, columns, - column_ids, + output_column_ids, order_types, pk_indices, Distribution::fallback(), @@ -177,7 +185,7 @@ impl StorageTableInner { store: S, table_id: TableId, table_columns: Vec, - column_ids: Vec, + output_column_ids: Vec, order_types: Vec, pk_indices: Vec, Distribution { @@ -191,7 +199,8 @@ impl StorageTableInner { ) -> Self { assert_eq!(order_types.len(), pk_indices.len()); - let (output_columns, output_indices) = find_columns_by_ids(&table_columns, &column_ids); + let (output_columns, output_indices) = + find_columns_by_ids(&table_columns, &output_column_ids); let mut value_output_indices = vec![]; let mut key_output_indices = vec![]; diff --git a/src/storage/src/table/mod.rs b/src/storage/src/table/mod.rs index b6407528d5272..e22b154ccfc93 100644 --- a/src/storage/src/table/mod.rs +++ b/src/storage/src/table/mod.rs @@ -128,12 +128,13 @@ where } /// Collects data chunks from stream of rows. -pub async fn collect_data_chunk_with_builder( +pub async fn collect_data_chunk_with_builder( stream: &mut S, builder: &mut DataChunkBuilder, ) -> Result, E> where - S: Stream> + Unpin, + R: Row, + S: Stream> + Unpin, { // TODO(kwannoel): If necessary, we can optimize it in the future. // This can be done by moving the check if builder is full from `append_one_row` to here, @@ -206,6 +207,7 @@ fn check_vnode_is_set(vnode: VirtualNode, vnodes: &Bitmap) { ); } +#[derive(Debug)] pub struct KeyedRow> { vnode_prefixed_key: TableKey, row: OwnedRow, @@ -230,6 +232,10 @@ impl> KeyedRow { pub fn key(&self) -> &[u8] { self.vnode_prefixed_key.key_part() } + + pub fn into_parts(self) -> (TableKey, OwnedRow) { + (self.vnode_prefixed_key, self.row) + } } impl> Deref for KeyedRow { diff --git a/src/stream/src/common/builder.rs b/src/stream/src/common/builder.rs index 947a79f3747c9..6180b2cd69163 100644 --- a/src/stream/src/common/builder.rs +++ b/src/stream/src/common/builder.rs @@ -12,136 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -use risingwave_common::array::stream_record::Record; -use risingwave_common::array::{ArrayBuilderImpl, Op, StreamChunk}; +// Re-export `StreamChunkBuilder`. +pub use risingwave_common::array::stream_chunk_builder::StreamChunkBuilder; +use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::row::Row; use risingwave_common::types::{DataType, DatumRef}; -use risingwave_common::util::iter_util::ZipEqFast; - -/// Build stream chunks with fixed chunk size from rows or records. 
-pub struct StreamChunkBuilder { - /// operations in the data chunk to build - ops: Vec, - - /// arrays in the data chunk to build - column_builders: Vec, - - /// Data types of columns - data_types: Vec, - - /// Maximum capacity of column builder - capacity: usize, - - /// Size of column builder - size: usize, -} - -impl Drop for StreamChunkBuilder { - fn drop(&mut self) { - // Possible to fail when async task gets cancelled. - if self.size != 0 { - tracing::warn!( - remaining = self.size, - "dropping non-empty stream chunk builder" - ); - } - } -} - -impl StreamChunkBuilder { - pub fn new(chunk_size: usize, data_types: Vec) -> Self { - assert!(chunk_size > 0); - - let ops = Vec::with_capacity(chunk_size); - let column_builders = data_types - .iter() - .map(|datatype| datatype.create_array_builder(chunk_size)) - .collect(); - Self { - ops, - column_builders, - data_types, - capacity: chunk_size, - size: 0, - } - } - - /// Increase chunk size - /// - /// A [`StreamChunk`] will be returned when `size == capacity` - #[must_use] - fn inc_size(&mut self) -> Option { - self.size += 1; - - // Take a chunk when capacity is exceeded. Splitting `UpdateDelete` and `UpdateInsert` - // should be avoided, so when the last one is `UpdateDelete`, we delay the chunk until - // `UpdateInsert` comes. This means the output chunk size may exceed the given `chunk_size`, - // and theoretically at most `chunk_size + 1` if inputs are consistent. - if self.size >= self.capacity && self.ops[self.ops.len() - 1] != Op::UpdateDelete { - self.take() - } else { - None - } - } - - /// Append an iterator of output index and datum to the builder, return a chunk if the builder - /// is full. - /// Note: the caller must ensure that each column occurs exactly once in `iter`. - fn append_iter<'a>( - &mut self, - op: Op, - iter: impl IntoIterator)>, - ) -> Option { - self.ops.push(op); - for (i, datum) in iter { - self.column_builders[i].append(datum); - } - self.inc_size() - } - - /// Append a row to the builder, return a chunk if the builder is full. - #[must_use] - pub fn append_row(&mut self, op: Op, row: impl Row) -> Option { - self.append_iter(op, row.iter().enumerate()) - } - - /// Append a record to the builder, return a chunk if the builder is full. 
- #[must_use] - pub fn append_record(&mut self, record: Record) -> Option { - match record { - Record::Insert { new_row } => self.append_row(Op::Insert, new_row), - Record::Delete { old_row } => self.append_row(Op::Delete, old_row), - Record::Update { old_row, new_row } => { - let none = self.append_row(Op::UpdateDelete, old_row); - debug_assert!(none.is_none()); - self.append_row(Op::UpdateInsert, new_row) - } - } - } - - #[must_use] - pub fn take(&mut self) -> Option { - if self.size == 0 { - return None; - } - - self.size = 0; - let new_columns = self - .column_builders - .iter_mut() - .zip_eq_fast(&self.data_types) - .map(|(builder, datatype)| { - std::mem::replace(builder, datatype.create_array_builder(self.capacity)).finish() - }) - .map(Into::into) - .collect::>(); - - Some(StreamChunk::new( - std::mem::replace(&mut self.ops, Vec::with_capacity(self.capacity)), - new_columns, - )) - } -} type IndexMappings = Vec<(usize, usize)>; diff --git a/src/stream/src/common/table/state_table.rs b/src/stream/src/common/table/state_table.rs index bfdd50f883fd7..b2b172cc79858 100644 --- a/src/stream/src/common/table/state_table.rs +++ b/src/stream/src/common/table/state_table.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::default::Default; use std::ops::Bound; use std::ops::Bound::*; use std::sync::Arc; @@ -25,10 +26,12 @@ use risingwave_common::array::stream_record::Record; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::buffer::Bitmap; use risingwave_common::cache::CachePriority; -use risingwave_common::catalog::{get_dist_key_in_pk_indices, ColumnDesc, TableId, TableOption}; +use risingwave_common::catalog::{ + get_dist_key_in_pk_indices, ColumnDesc, ColumnId, TableId, TableOption, +}; use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; use risingwave_common::row::{self, once, CompactedRow, Once, OwnedRow, Row, RowExt}; -use risingwave_common::types::{Datum, DefaultOrd, DefaultOrdered, ScalarImpl}; +use risingwave_common::types::{DataType, Datum, DefaultOrd, DefaultOrdered, ScalarImpl}; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::iter_util::{ZipEqDebug, ZipEqFast}; use risingwave_common::util::row_serde::OrderedRowSerde; @@ -42,6 +45,7 @@ use risingwave_pb::catalog::Table; use risingwave_storage::error::{ErrorKind, StorageError, StorageResult}; use risingwave_storage::hummock::CachePolicy; use risingwave_storage::mem_table::MemTableError; +use risingwave_storage::row_serde::find_columns_by_ids; use risingwave_storage::row_serde::row_serde_util::{ deserialize_pk_with_vnode, serialize_pk, serialize_pk_with_vnode, }; @@ -138,13 +142,23 @@ pub struct StateTableInner< /// Watermark cache watermark_cache: StateTableWatermarkCache, + + /// Data Types + /// We will need to use to build data chunks from state table rows. + data_types: Vec, + + /// Output indices + /// Used for: + /// 1. Computing output_value_indices to ser/de replicated rows. + /// 2. Computing output pk indices to used them for backfill state. + output_indices: Vec, } /// `StateTable` will use `BasicSerde` as default pub type StateTable = StateTableInner; /// `ReplicatedStateTable` is meant to replicate upstream shared buffer. /// Used for `ArrangementBackfill` executor. -pub type ReplicatedStateTable = StateTableInner; +pub type ReplicatedStateTable = StateTableInner; /// `WatermarkCacheStateTable` caches the watermark column. /// It will reduce state cleaning overhead. 
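`ReplicatedStateTable` above is `StateTableInner` with its `IS_REPLICATED` const parameter set to `true`, and the new `data_types` / `output_indices` fields feed the replicated-only code paths later in this hunk. A minimal sketch of the const-generic-flag pattern, with illustrative names only:

struct TableInner<const IS_REPLICATED: bool> {
    output_indices: Vec<usize>,
}

type PlainTable = TableInner<false>;
type ReplicatedTable = TableInner<true>;

impl<const IS_REPLICATED: bool> TableInner<IS_REPLICATED> {
    fn project_row(&self, row: Vec<i32>) -> Vec<i32> {
        if IS_REPLICATED {
            // only the replicated variant reorders rows to the requested output columns
            self.output_indices.iter().map(|&i| row[i]).collect()
        } else {
            row
        }
    }
}

fn main() {
    let replicated: ReplicatedTable = TableInner { output_indices: vec![2, 0] };
    assert_eq!(replicated.project_row(vec![10, 20, 30]), vec![30, 10]);
    let plain: PlainTable = TableInner { output_indices: vec![] };
    assert_eq!(plain.project_row(vec![10, 20, 30]), vec![10, 20, 30]);
}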
pub type WatermarkCacheStateTable = @@ -202,7 +216,7 @@ where store: S, vnodes: Option>, ) -> Self { - Self::from_table_catalog_inner(table_catalog, store, vnodes, true).await + Self::from_table_catalog_inner(table_catalog, store, vnodes, true, vec![]).await } /// Create state table from table catalog and store with sanity check disabled. @@ -211,7 +225,7 @@ where store: S, vnodes: Option>, ) -> Self { - Self::from_table_catalog_inner(table_catalog, store, vnodes, false).await + Self::from_table_catalog_inner(table_catalog, store, vnodes, false, vec![]).await } /// Create state table from table catalog and store. @@ -220,6 +234,7 @@ where store: S, vnodes: Option>, is_consistent_op: bool, + output_indices: Vec, ) -> Self { let table_id = TableId::new(table_catalog.id); let table_columns: Vec = table_catalog @@ -227,6 +242,17 @@ where .iter() .map(|col| col.column_desc.as_ref().unwrap().into()) .collect(); + let data_types: Vec = table_catalog + .columns + .iter() + .map(|col| { + col.get_column_desc() + .unwrap() + .get_column_type() + .unwrap() + .into() + }) + .collect(); let order_types: Vec = table_catalog .pk .iter() @@ -299,9 +325,15 @@ where Arc::from_iter(table_catalog.value_indices.iter().map(|val| *val as usize)), Arc::from(table_columns.into_boxed_slice()), ); + + // If state table has versioning, that means it supports + // Schema change. In that case, the row encoding should be column aware as well. + // Otherwise both will be false. + // NOTE(kwannoel): Replicated table will follow upstream table's versioning. I'm not sure + // If ALTER TABLE will propagate to this replicated table as well. Ideally it won't assert_eq!( - row_serde.kind().is_column_aware(), - table_catalog.version.is_some() + table_catalog.version.is_some(), + row_serde.kind().is_column_aware() ); let watermark_cache = if USE_WATERMARK_CACHE { @@ -326,6 +358,8 @@ where state_clean_watermark: None, prev_cleaned_watermark: None, watermark_cache, + data_types, + output_indices, } } @@ -458,7 +492,10 @@ where TableOption::default(), )) .await; - + let data_types: Vec = table_columns + .iter() + .map(|col| col.data_type.clone()) + .collect(); let pk_data_types = pk_indices .iter() .map(|i| table_columns[*i].data_type.clone()) @@ -470,7 +507,6 @@ where } else { StateTableWatermarkCache::new(0) }; - Self { table_id, local_store: local_state_store, @@ -495,9 +531,15 @@ where state_clean_watermark: None, prev_cleaned_watermark: None, watermark_cache, + data_types, + output_indices: vec![], } } + pub fn get_data_types(&self) -> &[DataType] { + &self.data_types + } + pub fn table_id(&self) -> u32 { self.table_id.table_id } @@ -541,11 +583,23 @@ where compute_vnode(pk, &self.dist_key_in_pk_indices, &self.vnodes) } - // TODO: remove, should not be exposed to user + /// NOTE(kwannoel): This is used by backfill. + /// We want to check pk indices of upstream table. pub fn pk_indices(&self) -> &[usize] { &self.pk_indices } + /// Get the indices of the primary key columns in the output columns. + /// + /// Returns `None` if any of the primary key columns is not in the output columns. 
+ pub fn pk_in_output_indices(&self) -> Option> { + assert!(IS_REPLICATED); + self.pk_indices + .iter() + .map(|&i| self.output_indices.iter().position(|&j| i == j)) + .collect() + } + pub fn pk_serde(&self) -> &OrderedRowSerde { &self.pk_serde } @@ -571,6 +625,29 @@ where } } +impl StateTableInner +where + S: StateStore, + SD: ValueRowSerde, + W: WatermarkBufferStrategy, +{ + /// Create replicated state table from table catalog with output indices + pub async fn from_table_catalog_with_output_column_ids( + table_catalog: &Table, + store: S, + vnodes: Option>, + output_column_ids: Vec, + ) -> Self { + let columns = table_catalog + .columns + .iter() + .map(|c| c.column_desc.as_ref().unwrap().into()) + .collect_vec(); + let (_, output_indices) = find_columns_by_ids(&columns[..], &output_column_ids); + Self::from_table_catalog_inner(table_catalog, store, vnodes, false, output_indices).await + } +} + // point get impl< S, @@ -589,7 +666,14 @@ where match encoded_row { Some(encoded_row) => { let row = self.row_serde.deserialize(&encoded_row)?; - Ok(Some(OwnedRow::new(row))) + if IS_REPLICATED { + // If the table is replicated, we need to deserialize the row with the output + // indices. + let row = row.project(&self.output_indices); + Ok(Some(row.into_owned_row())) + } else { + Ok(Some(OwnedRow::new(row))) + } } None => Ok(None), } @@ -1131,6 +1215,25 @@ where )) } + pub async fn iter_with_vnode_and_output_indices( + &self, + vnode: VirtualNode, + pk_range: &(Bound, Bound), + prefetch_options: PrefetchOptions, + ) -> StreamExecutorResult>> + '_> { + assert!(IS_REPLICATED); + let stream = self + .iter_with_vnode(vnode, pk_range, prefetch_options) + .await?; + Ok(stream.map(|row| { + row.map(|keyed_row| { + let (vnode_prefixed_key, row) = keyed_row.into_parts(); + let row = row.project(&self.output_indices).into_owned_row(); + KeyedRow::new(vnode_prefixed_key, row) + }) + })) + } + async fn iter_kv( &self, key_range: (Bound, Bound), @@ -1139,12 +1242,11 @@ where ) -> StreamExecutorResult<::IterStream<'_>> { let read_options = ReadOptions { prefix_hint, - ignore_range_tombstone: false, retention_seconds: self.table_option.retention_seconds, table_id: self.table_id, - read_version_from_backup: false, prefetch_options, cache_policy: CachePolicy::Fill(CachePriority::High), + ..Default::default() }; let table_key_range = map_table_key_range(key_range); @@ -1225,7 +1327,6 @@ where prefetch_options: PrefetchOptions, ) -> StreamExecutorResult<::IterStream<'_>> { let memcomparable_range = prefix_range_to_memcomparable(&self.pk_serde, pk_range); - let memcomparable_range_with_vnode = prefixed_range_with_vnode(memcomparable_range, vnode); // TODO: provide a trace of useful params. @@ -1273,12 +1374,9 @@ where let read_options = ReadOptions { prefix_hint, - ignore_range_tombstone: false, - retention_seconds: None, table_id: self.table_id, - read_version_from_backup: false, - prefetch_options: Default::default(), cache_policy: CachePolicy::Fill(CachePriority::High), + ..Default::default() }; self.local_store diff --git a/src/stream/src/executor/agg_common.rs b/src/stream/src/executor/agg_common.rs index b1feac670d942..6df0e58c6ace9 100644 --- a/src/stream/src/executor/agg_common.rs +++ b/src/stream/src/executor/agg_common.rs @@ -13,7 +13,6 @@ // limitations under the License. 
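The `ReadOptions` construction in `iter_kv` (and in the prefix-scan path) above now relies on struct update syntax, so only the fields that matter are spelled out and the rest come from `Default::default()`. A small self-contained sketch with a stand-in options struct whose defaults mirror the removed explicit fields:

#[derive(Default)]
struct Options {
    prefix_hint: Option<Vec<u8>>,
    retention_seconds: Option<u32>,
    ignore_range_tombstone: bool,
    read_version_from_backup: bool,
}

fn main() {
    // fields not listed fall back to their `Default` values
    let opts = Options {
        retention_seconds: Some(60),
        ..Default::default()
    };
    assert!(!opts.ignore_range_tombstone);
    assert!(!opts.read_version_from_backup);
    assert!(opts.prefix_hint.is_none());
    assert_eq!(opts.retention_seconds, Some(60));
}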
use std::collections::HashMap; -use std::sync::Arc; use risingwave_expr::aggregate::AggCall; use risingwave_pb::stream_plan::PbAggNodeVersion; @@ -22,7 +21,6 @@ use risingwave_storage::StateStore; use super::aggregation::AggStateStorage; use super::{Executor, ExecutorInfo}; use crate::common::table::state_table::StateTable; -use crate::executor::monitor::StreamingMetrics; use crate::executor::ActorContextRef; use crate::task::AtomicU64Ref; @@ -45,7 +43,7 @@ pub struct AggExecutorArgs { pub intermediate_state_table: StateTable, pub distinct_dedup_tables: HashMap>, pub watermark_epoch: AtomicU64Ref, - pub metrics: Arc, + // extra pub extra: E, } diff --git a/src/stream/src/executor/aggregation/distinct.rs b/src/stream/src/executor/aggregation/distinct.rs index 9e1d8d66da848..079ddf8661ae0 100644 --- a/src/stream/src/executor/aggregation/distinct.rs +++ b/src/stream/src/executor/aggregation/distinct.rs @@ -29,24 +29,20 @@ use super::{AggCall, GroupKey}; use crate::cache::{new_unbounded, ManagedLruCache}; use crate::common::metrics::MetricsInfo; use crate::common::table::state_table::StateTable; -use crate::executor::monitor::StreamingMetrics; use crate::executor::{ActorContextRef, StreamExecutorResult}; -use crate::task::ActorId; type DedupCache = ManagedLruCache>; /// Deduplicater for one distinct column. struct ColumnDeduplicater { cache: DedupCache, - metrics_info: MetricsInfo, _phantom: PhantomData, } impl ColumnDeduplicater { - fn new(watermark_epoch: &Arc, metrics_info: MetricsInfo) -> Self { + fn new(watermark_epoch: Arc, metrics_info: MetricsInfo) -> Self { Self { - cache: new_unbounded(watermark_epoch.clone(), metrics_info.clone()), - metrics_info, + cache: new_unbounded(watermark_epoch, metrics_info), _phantom: PhantomData, } } @@ -83,8 +79,7 @@ impl ColumnDeduplicater { let cache_key = CompactedRow::from(group_key.map(GroupKey::cache_key).chain(row::once(datum))); - self.metrics_info - .metrics + ctx.streaming_metrics .agg_distinct_total_cache_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); @@ -93,8 +88,7 @@ impl ColumnDeduplicater { let mut counts = if self.cache.contains(&cache_key) { self.cache.get_mut(&cache_key).unwrap() } else { - self.metrics_info - .metrics + ctx.streaming_metrics .agg_distinct_cache_miss_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc(); @@ -190,15 +184,15 @@ impl ColumnDeduplicater { // TODO(rc): now we flush the table in `dedup` method. // WARN: if you want to change to batching the write to table. please remember to change // `self.cache.evict()` too. + self.cache.evict(); + let actor_id_str = ctx.id.to_string(); let fragment_id_str = ctx.fragment_id.to_string(); let table_id_str = dedup_table.table_id().to_string(); - self.metrics_info - .metrics + ctx.streaming_metrics .agg_distinct_cached_entry_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .set(self.cache.len() as i64); - self.cache.evict(); } } @@ -218,16 +212,17 @@ pub struct DistinctDeduplicater { /// Key: distinct column index; /// Value: (agg call indices that distinct on the column, deduplicater for the column). 
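The `deduplicaters` map described above is keyed by distinct column index, so all agg calls that are distinct on the same column share one `ColumnDeduplicater` and one dedup state table. A simplified sketch of that grouping step (indices only, no real `AggCall`):

use std::collections::HashMap;

fn group_by_distinct_column(
    // (call index, distinct column index) for each distinct agg call
    distinct_calls: impl IntoIterator<Item = (usize, usize)>,
) -> HashMap<usize, Vec<usize>> {
    let mut map: HashMap<usize, Vec<usize>> = HashMap::new();
    for (call_idx, distinct_col) in distinct_calls {
        map.entry(distinct_col).or_default().push(call_idx);
    }
    map
}

fn main() {
    // calls 0 and 2 are distinct on column 1, call 1 on column 3
    let groups = group_by_distinct_column([(0, 1), (2, 1), (1, 3)]);
    assert_eq!(groups[&1], vec![0, 2]);
    assert_eq!(groups[&3], vec![1]);
}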
deduplicaters: HashMap, ColumnDeduplicater)>, + ctx: ActorContextRef, } impl DistinctDeduplicater { pub fn new( agg_calls: &[AggCall], - watermark_epoch: &Arc, + watermark_epoch: Arc, distinct_dedup_tables: &HashMap>, - actor_id: ActorId, - metrics: Arc, + ctx: ActorContextRef, ) -> Self { + let actor_id = ctx.id; let deduplicaters: HashMap<_, _> = agg_calls .iter() .enumerate() @@ -236,14 +231,18 @@ impl DistinctDeduplicater { .into_iter() .map(|(distinct_col, indices_and_calls)| { let table_id = distinct_dedup_tables.get(&distinct_col).unwrap().table_id(); - let metrics_info = - MetricsInfo::new(metrics.clone(), table_id, actor_id, "distinct dedup"); + let metrics_info = MetricsInfo::new( + ctx.streaming_metrics.clone(), + table_id, + actor_id, + "distinct dedup", + ); let call_indices: Box<[_]> = indices_and_calls.into_iter().map(|v| v.0).collect(); - let deduplicater = ColumnDeduplicater::new(watermark_epoch, metrics_info); + let deduplicater = ColumnDeduplicater::new(watermark_epoch.clone(), metrics_info); (distinct_col, (call_indices, deduplicater)) }) .collect(); - Self { deduplicaters } + Self { deduplicaters, ctx } } pub fn dedup_caches_mut(&mut self) -> impl Iterator { @@ -261,7 +260,6 @@ impl DistinctDeduplicater { mut visibilities: Vec, dedup_tables: &mut HashMap>, group_key: Option<&GroupKey>, - ctx: ActorContextRef, ) -> StreamExecutorResult> { for (distinct_col, (ref call_indices, deduplicater)) in &mut self.deduplicaters { let column = &columns[*distinct_col]; @@ -277,7 +275,7 @@ impl DistinctDeduplicater { visibilities, dedup_table, group_key, - ctx.clone(), + self.ctx.clone(), ) .await?; } @@ -288,11 +286,10 @@ impl DistinctDeduplicater { pub fn flush( &mut self, dedup_tables: &mut HashMap>, - ctx: ActorContextRef, ) -> StreamExecutorResult<()> { for (distinct_col, (_, deduplicater)) in &mut self.deduplicaters { let dedup_table = dedup_tables.get_mut(distinct_col).unwrap(); - deduplicater.flush(dedup_table, ctx.clone()); + deduplicater.flush(dedup_table, self.ctx.clone()); } Ok(()) } @@ -309,7 +306,6 @@ mod tests { use risingwave_storage::memory::MemoryStateStore; use super::*; - use crate::executor::monitor::StreamingMetrics; use crate::executor::ActorContext; async fn infer_dedup_tables( @@ -394,10 +390,9 @@ mod tests { let mut deduplicater = DistinctDeduplicater::new( &agg_calls, - &Arc::new(AtomicU64::new(0)), + Arc::new(AtomicU64::new(0)), &dedup_tables, - 0, - Arc::new(StreamingMetrics::unused()), + ActorContext::create(0), ); // --- chunk 1 --- @@ -413,14 +408,7 @@ mod tests { .take(agg_calls.len()) .collect_vec(); let visibilities = deduplicater - .dedup_chunk( - &ops, - &columns, - visibilities, - &mut dedup_tables, - None, - ActorContext::create(0), - ) + .dedup_chunk(&ops, &columns, visibilities, &mut dedup_tables, None) .await .unwrap(); assert_eq!( @@ -440,9 +428,7 @@ mod tests { vec![true, true] // distinct on b ); - deduplicater - .flush(&mut dedup_tables, ActorContext::create(0)) - .unwrap(); + deduplicater.flush(&mut dedup_tables).unwrap(); epoch.inc(); for table in dedup_tables.values_mut() { @@ -463,14 +449,7 @@ mod tests { .take(agg_calls.len()) .collect_vec(); let visibilities = deduplicater - .dedup_chunk( - &ops, - &columns, - visibilities, - &mut dedup_tables, - None, - ActorContext::create(0), - ) + .dedup_chunk(&ops, &columns, visibilities, &mut dedup_tables, None) .await .unwrap(); assert_eq!( @@ -490,9 +469,7 @@ mod tests { vec![false, false, true] // distinct on b ); - deduplicater - .flush(&mut dedup_tables, ActorContext::create(0)) - 
.unwrap(); + deduplicater.flush(&mut dedup_tables).unwrap(); epoch.inc(); for table in dedup_tables.values_mut() { @@ -504,10 +481,9 @@ mod tests { // test recovery let mut deduplicater = DistinctDeduplicater::new( &agg_calls, - &Arc::new(AtomicU64::new(0)), + Arc::new(AtomicU64::new(0)), &dedup_tables, - 0, - Arc::new(StreamingMetrics::unused()), + ActorContext::create(0), ); // --- chunk 3 --- @@ -524,14 +500,7 @@ mod tests { .take(agg_calls.len()) .collect_vec(); let visibilities = deduplicater - .dedup_chunk( - &ops, - &columns, - visibilities, - &mut dedup_tables, - None, - ActorContext::create(0), - ) + .dedup_chunk(&ops, &columns, visibilities, &mut dedup_tables, None) .await .unwrap(); assert_eq!( @@ -566,9 +535,7 @@ mod tests { ] ); - deduplicater - .flush(&mut dedup_tables, ActorContext::create(0)) - .unwrap(); + deduplicater.flush(&mut dedup_tables).unwrap(); epoch.inc(); for table in dedup_tables.values_mut() { @@ -603,10 +570,9 @@ mod tests { let mut deduplicater = DistinctDeduplicater::new( &agg_calls, - &Arc::new(AtomicU64::new(0)), + Arc::new(AtomicU64::new(0)), &dedup_tables, - 0, - Arc::new(StreamingMetrics::unused()), + ActorContext::create(0), ); let chunk = StreamChunk::from_pretty( @@ -629,7 +595,6 @@ mod tests { visibilities, &mut dedup_tables, Some(&group_key), - ActorContext::create(0), ) .await .unwrap(); @@ -646,9 +611,7 @@ mod tests { vec![true, true, false, false, true] // distinct on b ); - deduplicater - .flush(&mut dedup_tables, ActorContext::create(0)) - .unwrap(); + deduplicater.flush(&mut dedup_tables).unwrap(); epoch.inc(); for table in dedup_tables.values_mut() { @@ -673,7 +636,6 @@ mod tests { visibilities, &mut dedup_tables, Some(&group_key), - ActorContext::create(0), ) .await .unwrap(); @@ -700,9 +662,7 @@ mod tests { ] ); - deduplicater - .flush(&mut dedup_tables, ActorContext::create(0)) - .unwrap(); + deduplicater.flush(&mut dedup_tables).unwrap(); epoch.inc(); for table in dedup_tables.values_mut() { diff --git a/src/stream/src/executor/backfill/arrangement_backfill.rs b/src/stream/src/executor/backfill/arrangement_backfill.rs index 0bd6e47841584..28fcaa8862faa 100644 --- a/src/stream/src/executor/backfill/arrangement_backfill.rs +++ b/src/stream/src/executor/backfill/arrangement_backfill.rs @@ -12,35 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashMap; use std::pin::pin; use std::sync::Arc; use either::Either; use futures::stream::select_with_strategy; -use futures::{pin_mut, stream, StreamExt, TryStreamExt}; +use futures::{stream, StreamExt, TryStreamExt}; use futures_async_stream::try_stream; use itertools::Itertools; use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::bail; use risingwave_common::catalog::Schema; use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; -use risingwave_common::types::Datum; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; use risingwave_common::util::iter_util::ZipEqDebug; -use risingwave_common::util::select_all; +use risingwave_storage::row_serde::value_serde::ValueRowSerde; use risingwave_storage::StateStore; -use crate::common::table::state_table::ReplicatedStateTable; +use crate::common::table::state_table::{ReplicatedStateTable, StateTable}; +#[cfg(debug_assertions)] +use crate::executor::backfill::utils::METADATA_STATE_LEN; use crate::executor::backfill::utils::{ - compute_bounds, construct_initial_finished_state, get_progress_per_vnode, iter_chunks, - mapping_chunk, mapping_message, mark_chunk_ref_by_vnode, owned_row_iter, - persist_state_per_vnode, update_pos_by_vnode, BackfillProgressPerVnode, BackfillState, + compute_bounds, create_builder, get_progress_per_vnode, iter_chunks, mapping_chunk, + mapping_message, mark_chunk_ref_by_vnode, owned_row_iter, persist_state_per_vnode, + update_pos_by_vnode, BackfillProgressPerVnode, BackfillState, }; use crate::executor::monitor::StreamingMetrics; use crate::executor::{ expect_first_barrier, Barrier, BoxedExecutor, BoxedMessageStream, Executor, ExecutorInfo, - Message, PkIndices, PkIndicesRef, StreamExecutorError, + Message, PkIndicesRef, StreamExecutorError, }; use crate::task::{ActorId, CreateMviewProgress}; @@ -49,15 +49,15 @@ use crate::task::{ActorId, CreateMviewProgress}; /// - [`ArrangementBackfillExecutor`] can reside on a different CN, so it can be scaled /// independently. /// - To synchronize upstream shared buffer, it is initialized with a [`ReplicatedStateTable`]. -pub struct ArrangementBackfillExecutor { +pub struct ArrangementBackfillExecutor { /// Upstream table - upstream_table: ReplicatedStateTable, + upstream_table: ReplicatedStateTable, /// Upstream with the same schema with the upstream table. upstream: BoxedExecutor, /// Internal state table for persisting state of backfill state. - state_table: ReplicatedStateTable, + state_table: StateTable, /// The column indices need to be forwarded to the downstream from the upstream and table scan. 
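The `output_indices` field above combines with the upstream table's pk indices to form `pk_in_output_indices`, now provided by the `pk_in_output_indices` method added to the replicated state table earlier in this diff. The mapping, restated standalone with concrete values:

// For each pk index, find its position among the output columns; the result is
// `None` if any pk column is not part of the output.
fn pk_in_output_indices(pk_indices: &[usize], output_indices: &[usize]) -> Option<Vec<usize>> {
    pk_indices
        .iter()
        .map(|&pk| output_indices.iter().position(|&o| o == pk))
        .collect()
}

fn main() {
    assert_eq!(pk_in_output_indices(&[0, 2], &[2, 1, 0]), Some(vec![2, 0]));
    assert_eq!(pk_in_output_indices(&[3], &[0, 1]), None);
}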
output_indices: Vec, @@ -71,31 +71,30 @@ pub struct ArrangementBackfillExecutor { metrics: Arc, chunk_size: usize, + + rate_limit: Option, } -impl ArrangementBackfillExecutor +impl ArrangementBackfillExecutor where S: StateStore, + SD: ValueRowSerde, { #[allow(clippy::too_many_arguments)] #[allow(dead_code)] pub fn new( - upstream_table: ReplicatedStateTable, + info: ExecutorInfo, + upstream_table: ReplicatedStateTable, upstream: BoxedExecutor, - state_table: ReplicatedStateTable, + state_table: StateTable, output_indices: Vec, progress: CreateMviewProgress, - schema: Schema, - pk_indices: PkIndices, metrics: Arc, chunk_size: usize, + rate_limit: Option, ) -> Self { Self { - info: ExecutorInfo { - schema, - pk_indices, - identity: "ArrangementBackfillExecutor".to_owned(), - }, + info, upstream_table, upstream, state_table, @@ -104,6 +103,7 @@ where progress, metrics, chunk_size, + rate_limit, } } @@ -111,79 +111,62 @@ where async fn execute_inner(mut self) { // The primary key columns, in the output columns of the upstream_table scan. // Table scan scans a subset of the columns of the upstream table. - let pk_in_output_indices = self - .upstream_table - .pk_indices() - .iter() - .map(|&i| self.output_indices.iter().position(|&j| i == j)) - .collect::>>() - .unwrap(); - let state_len = pk_in_output_indices.len() + 2; // +1 for backfill_finished, +1 for vnode key. + let pk_in_output_indices = self.upstream_table.pk_in_output_indices().unwrap(); + #[cfg(debug_assertions)] + let state_len = self.upstream_table.pk_indices().len() + METADATA_STATE_LEN; let pk_order = self.upstream_table.pk_serde().get_order_types().to_vec(); let upstream_table_id = self.upstream_table.table_id(); let mut upstream_table = self.upstream_table; let vnodes = upstream_table.vnodes().clone(); - let schema = Arc::new(self.upstream.schema().clone()); + // These builders will build data chunks. + // We must supply them with the full datatypes which correspond to + // pk + output_indices. + let snapshot_data_types = self + .upstream + .schema() + .fields() + .iter() + .map(|field| field.data_type.clone()) + .collect_vec(); + let mut builders = upstream_table + .vnodes() + .iter_vnodes() + .map(|_| { + create_builder( + self.rate_limit, + self.chunk_size, + snapshot_data_types.clone(), + ) + }) + .collect_vec(); let mut upstream = self.upstream.execute(); // Poll the upstream to get the first barrier. let first_barrier = expect_first_barrier(&mut upstream).await?; - self.state_table.init_epoch(first_barrier.epoch).await?; + let first_epoch = first_barrier.epoch; + self.state_table.init_epoch(first_barrier.epoch); let progress_per_vnode = get_progress_per_vnode(&self.state_table).await?; - let is_completely_finished = progress_per_vnode - .iter() - .all(|(_, p)| *p == BackfillProgressPerVnode::Completed); + let is_completely_finished = progress_per_vnode.iter().all(|(_, p)| { + matches!( + p.current_state(), + &BackfillProgressPerVnode::Completed { .. } + ) + }); if is_completely_finished { assert!(!first_barrier.is_newly_added(self.actor_id)); } - let mut backfill_state: BackfillState = progress_per_vnode.into(); - let mut committed_progress = HashMap::new(); - - let mut builders = upstream_table - .vnodes() - .iter_vnodes() - .map(|_| DataChunkBuilder::new(schema.data_types(), self.chunk_size)) - .collect_vec(); - - // If the snapshot is empty, we don't need to backfill. - // We cannot complete progress now, as we want to persist - // finished state to state store first. - // As such we will wait for next barrier. 
- let is_snapshot_empty: bool = { - if is_completely_finished { - // It is finished, so just assign a value to avoid accessing storage table again. - false - } else { - let snapshot = Self::snapshot_read_per_vnode( - &upstream_table, - backfill_state.clone(), // FIXME: temporary workaround... How to avoid it? - &mut builders, - ); - pin_mut!(snapshot); - snapshot.try_next().await?.unwrap().is_none() - } - }; - - // | backfill_is_finished | snapshot_empty | -> | need_to_backfill | - // | -------------------- | -------------- | -- | ---------------- | - // | t | t/f | -> | f | - // | f | t | -> | f | - // | f | f | -> | t | - let to_backfill = !is_completely_finished && !is_snapshot_empty; - - // Use these to persist state. - // They contain the backfill position, and the progress. - // However, they do not contain the vnode key (index 0). - // That is filled in when we flush the state table. - let mut temporary_state: Vec = vec![None; state_len]; - // The first barrier message should be propagated. yield Message::Barrier(first_barrier); + upstream_table.init_epoch(first_epoch).await?; + + let mut backfill_state: BackfillState = progress_per_vnode.into(); + + let to_backfill = !is_completely_finished; // If no need backfill, but state was still "unfinished" we need to finish it. // So we just update the state + progress to meta at the next barrier to finish progress, @@ -242,7 +225,7 @@ where let right_snapshot = pin!(Self::snapshot_read_per_vnode( &upstream_table, - backfill_state.clone(), // FIXME: temporary workaround, how to avoid it? + backfill_state.clone(), // FIXME: Use mutable reference instead. &mut builders, ) .map(Either::Right),); @@ -311,15 +294,16 @@ where &chunk, &pk_in_output_indices, &mut backfill_state, - ); + )?; let chunk_cardinality = chunk.cardinality() as u64; cur_barrier_snapshot_processed_rows += chunk_cardinality; total_snapshot_processed_rows += chunk_cardinality; - yield Message::Chunk(mapping_chunk( + let chunk = Message::Chunk(mapping_chunk( chunk, &self.output_indices, )); + yield chunk; } } } @@ -335,7 +319,6 @@ where Some(barrier) => barrier, None => bail!("BUG: current_backfill loop exited without a barrier"), }; - // TODO: Process existing buffered snapshots. // Process barrier: // - consume snapshot rows left in builder. @@ -360,7 +343,7 @@ where &chunk, &pk_in_output_indices, &mut backfill_state, - ); + )?; let chunk_cardinality = chunk.cardinality() as u64; cur_barrier_snapshot_processed_rows += chunk_cardinality; @@ -388,8 +371,9 @@ where &self.output_indices, )); } - // Replicate - upstream_table.write_chunk(chunk); + + // FIXME(kwannoel): Replicate + // upstream_table.write_chunk(chunk); } if upstream_chunk_buffer_is_empty { @@ -417,6 +401,9 @@ where // Update snapshot read epoch. snapshot_read_epoch = barrier.epoch.prev; + // TODO(kwannoel): Not sure if this holds for arrangement backfill. + // May need to revisit it. + // Need to check it after scale-in / scale-out. self.progress.update( barrier.epoch.curr, snapshot_read_epoch, @@ -427,13 +414,19 @@ where persist_state_per_vnode( barrier.epoch, &mut self.state_table, - false, - &backfill_state, - &mut committed_progress, - &mut temporary_state, + &mut backfill_state, + #[cfg(debug_assertions)] + state_len, + vnodes.iter_vnodes(), ) .await?; + tracing::trace!( + actor = self.actor_id, + barrier = ?barrier, + "barrier persisted" + ); + yield Message::Barrier(barrier); // We will switch snapshot at the start of the next iteration of the backfill loop. 
@@ -442,13 +435,20 @@ where tracing::trace!( actor = self.actor_id, - "Backfill has already finished and forward messages directly to the downstream" + "Arrangement Backfill has finished and forward messages directly to the downstream" ); + // Update our progress as finished in state table. + // Wait for first barrier to come after backfill is finished. // So we can update our progress + persist the status. while let Some(Ok(msg)) = upstream.next().await { if let Some(msg) = mapping_message(msg, &self.output_indices) { + tracing::trace!( + actor = self.actor_id, + message = ?msg, + "backfill_finished_wait_for_barrier" + ); // If not finished then we need to update state, otherwise no need. if let Message::Barrier(barrier) = &msg && !is_completely_finished @@ -459,24 +459,17 @@ where // This is because we can't update state table in first epoch, // since it expects to have been initialized in previous epoch // (there's no epoch before the first epoch). - if is_snapshot_empty { - let finished_state = - construct_initial_finished_state(pk_in_output_indices.len()); - for vnode in upstream_table.vnodes().iter_vnodes() { - backfill_state.update_progress( - vnode, - BackfillProgressPerVnode::InProgress(finished_state.clone()), - ); - } + for vnode in upstream_table.vnodes().iter_vnodes() { + backfill_state.finish_progress(vnode, upstream_table.pk_indices().len()); } persist_state_per_vnode( barrier.epoch, &mut self.state_table, - false, - &backfill_state, - &mut committed_progress, - &mut temporary_state, + &mut backfill_state, + #[cfg(debug_assertions)] + state_len, + vnodes.iter_vnodes() ) .await?; @@ -484,8 +477,10 @@ where .finish(barrier.epoch.curr, total_snapshot_processed_rows); yield msg; break; + } else { + // Allow other messages to pass through. + yield msg; } - yield msg; } } @@ -495,6 +490,11 @@ where #[for_await] for msg in upstream { if let Some(msg) = mapping_message(msg?, &self.output_indices) { + tracing::trace!( + actor = self.actor_id, + message = ?msg, + "backfill_finished_after_barrier" + ); if let Message::Barrier(barrier) = &msg { self.state_table.commit_no_data_expected(barrier.epoch); } @@ -510,19 +510,18 @@ where /// 3. Change it into a chunk iterator with `iter_chunks`. /// This means it should fetch a row from each iterator to form a chunk. /// - /// We will return chunks based on the `BackfillProgressPerVnode`. - /// 1. Completed(vnode): Current iterator is complete, in that case we need to handle it - /// in arrangement backfill. We should not buffer updates for this vnode, - /// and we should forward all messages. - /// 2. InProgress(CHUNK): Current iterator is not complete, in that case we - /// need to buffer updates for this vnode. - /// 3. Finished: All iterators finished. - /// - /// NOTE(kwannoel): We interleave at chunk per vnode level rather than rows. + /// We interleave at chunk per vnode level rather than rows. /// This is so that we can compute `current_pos` once per chunk, since they correspond to 1 /// vnode. /// - /// NOTE(kwannoel): + /// The stream contains pairs of `(VirtualNode, StreamChunk)`. + /// The `VirtualNode` is the vnode that the chunk belongs to. + /// The `StreamChunk` is the chunk that contains the rows from the vnode. + /// If it's `None`, it means the vnode has no more rows for this snapshot read. + /// + /// The `snapshot_read_epoch` is supplied as a parameter for `state_table`. + /// It is required to ensure we read a fully-checkpointed snapshot the **first time**. 
+ /// /// The rows from upstream snapshot read will be buffered inside the `builder`. /// If snapshot is dropped before its rows are consumed, /// remaining data in `builder` must be flushed manually. @@ -530,11 +529,10 @@ where /// present, Then when we flush we contain duplicate rows. #[try_stream(ok = Option<(VirtualNode, StreamChunk)>, error = StreamExecutorError)] async fn snapshot_read_per_vnode<'a>( - upstream_table: &'a ReplicatedStateTable, + upstream_table: &'a ReplicatedStateTable, backfill_state: BackfillState, builders: &'a mut [DataChunkBuilder], ) { - let mut streams = Vec::with_capacity(upstream_table.vnodes().len()); for (vnode, builder) in upstream_table .vnodes() .iter_vnodes() @@ -542,11 +540,9 @@ where { let backfill_progress = backfill_state.get_progress(&vnode)?; let current_pos = match backfill_progress { - BackfillProgressPerVnode::Completed => { - continue; - } BackfillProgressPerVnode::NotStarted => None, - BackfillProgressPerVnode::InProgress(current_pos) => Some(current_pos.clone()), + BackfillProgressPerVnode::Completed(current_pos) + | BackfillProgressPerVnode::InProgress(current_pos) => Some(current_pos.clone()), }; let range_bounds = compute_bounds(upstream_table.pk_indices(), current_pos.clone()); @@ -555,30 +551,36 @@ where } let range_bounds = range_bounds.unwrap(); + tracing::trace!( + vnode = ?vnode, + current_pos = ?current_pos, + range_bounds = ?range_bounds, + "iter_with_vnode_and_output_indices" + ); let vnode_row_iter = upstream_table - .iter_with_vnode(vnode, &range_bounds, Default::default()) + .iter_with_vnode_and_output_indices(vnode, &range_bounds, Default::default()) .await?; - // TODO: Is there some way to avoid double-pin here? let vnode_row_iter = Box::pin(owned_row_iter(vnode_row_iter)); - let vnode_chunk_iter = iter_chunks(vnode_row_iter, builder) - .map_ok(move |chunk_opt| chunk_opt.map(|chunk| (vnode, chunk))); - // TODO: Is there some way to avoid double-pin - streams.push(Box::pin(vnode_chunk_iter)); - } - #[for_await] - for chunk in select_all(streams) { - yield chunk?; + let vnode_chunk_iter = + iter_chunks(vnode_row_iter, builder).map_ok(move |chunk| (vnode, chunk)); + + // This means we iterate serially rather than in parallel across vnodes. 
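On the serial-iteration comment above: the removed code collected one chunk stream per vnode and polled them through `select_all`, so chunks from different vnodes interleaved; the new loop drains one vnode before starting the next. A rough sketch of the two shapes, using `futures`' `select_all` as a stand-in for the previous helper:

use futures::stream::{self, Stream, StreamExt};

// After this change: vnode 0's iterator is fully drained before vnode 1's starts.
async fn serial<S: Stream<Item = u64> + Unpin>(vnode_streams: Vec<S>) -> Vec<u64> {
    let mut out = Vec::new();
    for mut s in vnode_streams {
        while let Some(x) = s.next().await {
            out.push(x);
        }
    }
    out
}

// Roughly the previous shape: all per-vnode streams polled together, so items
// from different vnodes could interleave.
async fn interleaved<S: Stream<Item = u64> + Unpin>(vnode_streams: Vec<S>) -> Vec<u64> {
    stream::select_all(vnode_streams).collect::<Vec<_>>().await
}

fn main() {
    let out = futures::executor::block_on(serial(vec![stream::iter([1u64, 2]), stream::iter([10u64, 20])]));
    assert_eq!(out, vec![1, 2, 10, 20]);
    let out2 = futures::executor::block_on(interleaved(vec![stream::iter([3u64]), stream::iter([4u64])]));
    assert_eq!(out2.len(), 2);
}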
+ #[for_await] + for chunk in vnode_chunk_iter { + yield Some(chunk?); + } } yield None; return Ok(()); } } -impl Executor for ArrangementBackfillExecutor +impl Executor for ArrangementBackfillExecutor where S: StateStore, + SD: ValueRowSerde, { fn execute(self: Box) -> BoxedMessageStream { self.execute_inner().boxed() diff --git a/src/stream/src/executor/backfill/cdc/upstream_table/snapshot.rs b/src/stream/src/executor/backfill/cdc/upstream_table/snapshot.rs index 6c5ba1affe212..cc8883aeea6b1 100644 --- a/src/stream/src/executor/backfill/cdc/upstream_table/snapshot.rs +++ b/src/stream/src/executor/backfill/cdc/upstream_table/snapshot.rs @@ -104,8 +104,9 @@ impl UpstreamTableRead for UpstreamTableReader { let chunk_stream = iter_chunks(row_stream, &mut builder); #[for_await] for chunk in chunk_stream { - yield chunk?; + yield Some(chunk?); } + yield None; } async fn current_binlog_offset(&self) -> StreamExecutorResult> { diff --git a/src/stream/src/executor/backfill/no_shuffle_backfill.rs b/src/stream/src/executor/backfill/no_shuffle_backfill.rs index 05e2df32c9a52..bf1b5709c8920 100644 --- a/src/stream/src/executor/backfill/no_shuffle_backfill.rs +++ b/src/stream/src/executor/backfill/no_shuffle_backfill.rs @@ -23,7 +23,7 @@ use risingwave_common::array::{Op, StreamChunk}; use risingwave_common::catalog::Schema; use risingwave_common::hash::VnodeBitmapExt; use risingwave_common::row::{OwnedRow, Row}; -use risingwave_common::types::{DataType, Datum}; +use risingwave_common::types::Datum; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; use risingwave_common::util::epoch::EpochPair; use risingwave_common::{bail, row}; @@ -35,8 +35,8 @@ use risingwave_storage::StateStore; use crate::common::table::state_table::StateTable; use crate::executor::backfill::utils; use crate::executor::backfill::utils::{ - compute_bounds, construct_initial_finished_state, get_new_pos, iter_chunks, mapping_chunk, - mapping_message, mark_chunk, owned_row_iter, + compute_bounds, construct_initial_finished_state, create_builder, get_new_pos, iter_chunks, + mapping_chunk, mapping_message, mark_chunk, owned_row_iter, METADATA_STATE_LEN, }; use crate::executor::monitor::StreamingMetrics; use crate::executor::{ @@ -45,9 +45,6 @@ use crate::executor::{ }; use crate::task::{ActorId, CreateMviewProgress}; -/// vnode, `is_finished`, `row_count`, all occupy 1 column each. -const METADATA_STATE_LEN: usize = 3; - /// Schema: | vnode | pk ... | `backfill_finished` | `row_count` | /// We can decode that into `BackfillState` on recovery. #[derive(Debug, Eq, PartialEq)] @@ -170,7 +167,7 @@ where .await?; tracing::trace!(is_finished, row_count, "backfill state recovered"); - let mut builder = Self::create_builder( + let mut builder = create_builder( rate_limit, self.chunk_size, self.upstream_table.schema().data_types(), @@ -457,7 +454,7 @@ where "actor rate limit changed", ); assert!(builder.is_empty()); - builder = Self::create_builder( + builder = create_builder( rate_limit, self.chunk_size, self.upstream_table.schema().data_types(), @@ -646,14 +643,14 @@ where PrefetchOptions::prefetch_for_small_range_scan(), ) .await?; - let row_iter = owned_row_iter(iter); pin_mut!(row_iter); #[for_await] for chunk in iter_chunks(row_iter, builder) { - yield chunk?; + yield Some(chunk?); } + yield None; } async fn persist_state( @@ -678,29 +675,6 @@ where ) .await } - - /// Creates a data chunk builder for snapshot read. - /// If the `rate_limit` is smaller than `chunk_size`, it will take precedence. 
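The helper removed above now lives in `backfill::utils::create_builder` (see the updated imports in this file and in `arrangement_backfill.rs`). Its sizing rule, restated with the limit assumed to be a plain `usize`:

// If a rate limit is set and is smaller than the configured chunk size, the
// builder is sized to the rate limit so snapshot reads come out in smaller chunks.
fn builder_capacity(rate_limit: Option<usize>, chunk_size: usize) -> usize {
    match rate_limit {
        Some(limit) if limit < chunk_size => limit,
        _ => chunk_size,
    }
}

fn main() {
    assert_eq!(builder_capacity(Some(256), 1024), 256);
    assert_eq!(builder_capacity(Some(4096), 1024), 1024);
    assert_eq!(builder_capacity(None, 1024), 1024);
}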
- /// This is so we can partition snapshot read into smaller chunks than chunk size. - fn create_builder( - rate_limit: Option, - chunk_size: usize, - data_types: Vec, - ) -> DataChunkBuilder { - if let Some(rate_limit) = rate_limit - && rate_limit < chunk_size - { - DataChunkBuilder::new( - data_types, - rate_limit, - ) - } else { - DataChunkBuilder::new( - data_types, - chunk_size, - ) - } - } } impl Executor for BackfillExecutor diff --git a/src/stream/src/executor/backfill/utils.rs b/src/stream/src/executor/backfill/utils.rs index 663f9be94cf5e..d344b23c294dc 100644 --- a/src/stream/src/executor/backfill/utils.rs +++ b/src/stream/src/executor/backfill/utils.rs @@ -27,7 +27,7 @@ use risingwave_common::bail; use risingwave_common::buffer::BitmapBuilder; use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; use risingwave_common::row::{OwnedRow, Row, RowExt}; -use risingwave_common::types::Datum; +use risingwave_common::types::{DataType, Datum}; use risingwave_common::util::chunk_coalesce::DataChunkBuilder; use risingwave_common::util::epoch::EpochPair; use risingwave_common::util::iter_util::ZipEqDebug; @@ -45,21 +45,31 @@ use crate::executor::{ Message, PkIndicesRef, StreamExecutorError, StreamExecutorResult, Watermark, }; +/// `vnode`, `is_finished`, `row_count`, all occupy 1 column each. +pub const METADATA_STATE_LEN: usize = 3; + #[derive(Clone, Debug)] pub struct BackfillState { /// Used to track backfill progress. - inner: HashMap, + // TODO: Instead of using hashmap, perhaps we can just use static array. + inner: HashMap, } impl BackfillState { - fn has_no_progress(&self) -> bool { - self.inner - .values() - .all(|p| !matches!(p, BackfillProgressPerVnode::InProgress(_))) + pub(crate) fn has_progress(&self) -> bool { + self.inner.values().any(|p| { + matches!( + p.current_state(), + &BackfillProgressPerVnode::InProgress { .. } + ) + }) } - pub(crate) fn has_progress(&self) -> bool { - !self.has_no_progress() + pub(crate) fn get_current_state( + &mut self, + vnode: &VirtualNode, + ) -> &mut BackfillProgressPerVnode { + &mut self.inner.get_mut(vnode).unwrap().current_state } // Expects the vnode to always have progress, otherwise it will return an error. @@ -68,7 +78,7 @@ impl BackfillState { vnode: &VirtualNode, ) -> StreamExecutorResult<&BackfillProgressPerVnode> { match self.inner.get(vnode) { - Some(p) => Ok(p), + Some(p) => Ok(p.current_state()), None => bail!( "Backfill progress for vnode {:#?} not found, backfill_state not initialized properly", vnode, @@ -79,20 +89,132 @@ impl BackfillState { pub(crate) fn update_progress( &mut self, vnode: VirtualNode, - progress: BackfillProgressPerVnode, - ) -> Option { - self.inner.insert(vnode, progress) + new_pos: OwnedRow, + ) -> StreamExecutorResult<()> { + let state = self.get_current_state(&vnode); + let new_state = BackfillProgressPerVnode::InProgress(new_pos); + match state { + BackfillProgressPerVnode::NotStarted => *state = new_state, + BackfillProgressPerVnode::InProgress(_current_pos) => *state = new_state, + BackfillProgressPerVnode::Completed { .. 
} => unreachable!(), + } + Ok(()) } - fn iter_backfill_progress( - &self, - ) -> impl Iterator { - self.inner.iter() + pub(crate) fn finish_progress(&mut self, vnode: VirtualNode, pos_len: usize) { + let finished_placeholder_position = construct_initial_finished_state(pos_len); + let current_state = self.get_current_state(&vnode); + let new_pos = match current_state { + BackfillProgressPerVnode::NotStarted => finished_placeholder_position, + BackfillProgressPerVnode::InProgress(current_pos) => current_pos.clone(), + BackfillProgressPerVnode::Completed { .. } => { + return; + } + }; + *current_state = BackfillProgressPerVnode::Completed(new_pos); + } + + /// Return state to be committed. + fn get_commit_state(&self, vnode: &VirtualNode) -> Option<(Option>, Vec)> { + let new_state = self.inner.get(vnode).unwrap().current_state().clone(); + let new_encoded_state = match new_state { + BackfillProgressPerVnode::NotStarted => unreachable!(), + BackfillProgressPerVnode::InProgress(current_pos) => { + let mut encoded_state = vec![None; current_pos.len() + METADATA_STATE_LEN]; + encoded_state[0] = Some(vnode.to_scalar().into()); + encoded_state[1..current_pos.len() + 1].clone_from_slice(current_pos.as_inner()); + encoded_state[current_pos.len() + 1] = Some(false.into()); + encoded_state[current_pos.len() + 2] = Some(0i64.into()); + encoded_state + } + BackfillProgressPerVnode::Completed(current_pos) => { + let mut encoded_state = vec![None; current_pos.len() + METADATA_STATE_LEN]; + encoded_state[0] = Some(vnode.to_scalar().into()); + encoded_state[1..current_pos.len() + 1].clone_from_slice(current_pos.as_inner()); + encoded_state[current_pos.len() + 1] = Some(true.into()); + encoded_state[current_pos.len() + 2] = Some(0i64.into()); + encoded_state + } + }; + let old_state = self.inner.get(vnode).unwrap().committed_state().clone(); + let old_encoded_state = match old_state { + BackfillProgressPerVnode::NotStarted => None, + BackfillProgressPerVnode::InProgress(committed_pos) => { + let mut encoded_state = vec![None; committed_pos.len() + METADATA_STATE_LEN]; + encoded_state[0] = Some(vnode.to_scalar().into()); + encoded_state[1..committed_pos.len() + 1] + .clone_from_slice(committed_pos.as_inner()); + encoded_state[committed_pos.len() + 1] = Some(false.into()); + encoded_state[committed_pos.len() + 2] = Some(0i64.into()); + Some(encoded_state) + } + BackfillProgressPerVnode::Completed(committed_pos) => { + let mut encoded_state = vec![None; committed_pos.len() + METADATA_STATE_LEN]; + encoded_state[0] = Some(vnode.to_scalar().into()); + encoded_state[1..committed_pos.len() + 1] + .clone_from_slice(committed_pos.as_inner()); + encoded_state[committed_pos.len() + 1] = Some(true.into()); + encoded_state[committed_pos.len() + 2] = Some(0i64.into()); + Some(encoded_state) + } + }; + Some((old_encoded_state, new_encoded_state)) + } + + // TODO: We can add a committed flag to speed up this check. + /// Checks if the state needs to be committed. + fn need_commit(&self, vnode: &VirtualNode) -> bool { + let state = self.inner.get(vnode).unwrap(); + match state.current_state() { + // If current state and committed state are the same, we don't need to commit. 
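`get_commit_state` above lays the progress out in the state-table schema `| vnode | pk ... | is_finished | row_count |` (with `row_count` currently written as 0). A simplified sketch of that row layout, with strings standing in for datums:

fn encode_progress(vnode: i16, current_pos: &[i64], is_finished: bool) -> Vec<String> {
    let mut row = Vec::with_capacity(current_pos.len() + 3);
    row.push(format!("vnode:{vnode}"));                        // column 0: vnode
    row.extend(current_pos.iter().map(|d| format!("pk:{d}"))); // columns 1..=pk_len: current_pos
    row.push(format!("is_finished:{is_finished}"));            // pk_len + 1
    row.push("row_count:0".to_string());                       // pk_len + 2
    row
}

fn main() {
    let row = encode_progress(3, &[42], false);
    assert_eq!(row.join("|"), "vnode:3|pk:42|is_finished:false|row_count:0");
}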
+ s @ BackfillProgressPerVnode::InProgress(_current_pos) + | s @ BackfillProgressPerVnode::Completed(_current_pos) => s != state.committed_state(), + BackfillProgressPerVnode::NotStarted => false, + } + } + + fn mark_committed(&mut self, vnode: VirtualNode) { + let BackfillStatePerVnode { + committed_state, + current_state, + } = self.inner.get_mut(&vnode).unwrap(); + + assert!(matches!( + current_state, + BackfillProgressPerVnode::InProgress(_) | BackfillProgressPerVnode::Completed(_) + )); + *committed_state = current_state.clone(); + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct BackfillStatePerVnode { + committed_state: BackfillProgressPerVnode, + current_state: BackfillProgressPerVnode, +} + +impl BackfillStatePerVnode { + pub(crate) fn new( + committed_state: BackfillProgressPerVnode, + current_state: BackfillProgressPerVnode, + ) -> Self { + Self { + committed_state, + current_state, + } + } + + pub(crate) fn committed_state(&self) -> &BackfillProgressPerVnode { + &self.committed_state + } + + pub(crate) fn current_state(&self) -> &BackfillProgressPerVnode { + &self.current_state } } -impl From> for BackfillState { - fn from(v: Vec<(VirtualNode, BackfillProgressPerVnode)>) -> Self { +impl From> for BackfillState { + fn from(v: Vec<(VirtualNode, BackfillStatePerVnode)>) -> Self { Self { inner: v.into_iter().collect(), } @@ -100,11 +222,13 @@ impl From> for BackfillState { } /// Used for tracking backfill state per vnode +/// The `OwnedRow` only contains the pk of upstream, to track `current_pos`. #[derive(Clone, Eq, PartialEq, Debug)] pub enum BackfillProgressPerVnode { + /// no entry exists for a vnode, or on initialization of the executor. NotStarted, InProgress(OwnedRow), - Completed, + Completed(OwnedRow), } pub(crate) fn mark_chunk( @@ -155,11 +279,14 @@ pub(crate) fn mark_chunk_ref_by_vnode( // I will revisit it again when arrangement_backfill is implemented e2e. let vnode = VirtualNode::compute_row(row, pk_in_output_indices); let v = match backfill_state.get_progress(&vnode)? { - BackfillProgressPerVnode::Completed => true, + // We want to just forward the row, if the vnode has finished backfill. + BackfillProgressPerVnode::Completed(_) => true, + // If not started, no need to forward. BackfillProgressPerVnode::NotStarted => false, + // If in progress, we need to check row <= current_pos. BackfillProgressPerVnode::InProgress(current_pos) => { let lhs = row.project(pk_in_output_indices); - let rhs = current_pos.project(pk_in_output_indices); + let rhs = current_pos; let order = cmp_datum_iter(lhs.iter(), rhs.iter(), pk_order.iter().copied()); match order { Ordering::Less | Ordering::Equal => true, @@ -274,10 +401,10 @@ pub(crate) fn mapping_message(msg: Message, upstream_indices: &[usize]) -> Optio } } -/// Gets progress per vnode, so we know which to backfill. +/// Recovers progress per vnode, so we know which to backfill. 
pub(crate) async fn get_progress_per_vnode( state_table: &StateTableInner, -) -> StreamExecutorResult> { +) -> StreamExecutorResult> { debug_assert!(!state_table.vnode_bitmap().is_empty()); let vnodes = state_table.vnodes().iter_vnodes(); let mut result = Vec::with_capacity(state_table.vnodes().len()); @@ -292,19 +419,36 @@ pub(crate) async fn get_progress_per_vnode { - let vnode_is_finished = row.last().unwrap(); - if vnode_is_finished.into_bool() { - BackfillProgressPerVnode::Completed + let vnode_is_finished = row.as_inner().get(row.len() - 2).unwrap(); + let vnode_is_finished = vnode_is_finished.as_ref().unwrap(); + + // Only the current pos should be contained in the in-memory backfill state. + // Row count will be added later. + let current_pos = row.as_inner().get(..row.len() - 2).unwrap(); + let current_pos = current_pos.into_owned_row(); + if *vnode_is_finished.as_bool() { + BackfillStatePerVnode::new( + BackfillProgressPerVnode::Completed(current_pos.clone()), + BackfillProgressPerVnode::Completed(current_pos), + ) } else { - BackfillProgressPerVnode::InProgress(row) + BackfillStatePerVnode::new( + BackfillProgressPerVnode::InProgress(current_pos.clone()), + BackfillProgressPerVnode::InProgress(current_pos), + ) } } - None => BackfillProgressPerVnode::NotStarted, + None => BackfillStatePerVnode::new( + BackfillProgressPerVnode::NotStarted, + BackfillProgressPerVnode::NotStarted, + ), }; result.push((vnode, backfill_progress)); } + assert_eq!(result.len(), state_table.vnodes().count_ones()); Ok(result) } @@ -345,18 +489,6 @@ pub(crate) async fn flush_data( table.commit(epoch).await } -/// We want to avoid allocating a row for every vnode. -pub(crate) fn build_temporary_state_with_vnode( - row_state: &mut [Datum], - vnode: VirtualNode, - is_finished: bool, - current_pos: &OwnedRow, -) { - row_state[1..current_pos.len() + 1].clone_from_slice(current_pos.as_inner()); - row_state[current_pos.len() + 1] = Some(is_finished.into()); - row_state[0] = Some(vnode.to_scalar().into()); -} - /// We want to avoid allocating a row for every vnode. /// Instead we can just modify a single row, and dispatch it to state table to write. /// This builds the following segments of the row: @@ -380,9 +512,11 @@ pub(crate) fn update_pos_by_vnode( chunk: &StreamChunk, pk_in_output_indices: &[usize], backfill_state: &mut BackfillState, -) { +) -> StreamExecutorResult<()> { let new_pos = get_new_pos(chunk, pk_in_output_indices); - backfill_state.update_progress(vnode, BackfillProgressPerVnode::InProgress(new_pos)); + assert_eq!(new_pos.len(), pk_in_output_indices.len()); + backfill_state.update_progress(vnode, new_pos)?; + Ok(()) } /// Get new backfill pos from the chunk. 
Since chunk should have ordered rows, we can just take the @@ -452,11 +586,12 @@ where } } -#[try_stream(ok = Option, error = StreamExecutorError)] -pub(crate) async fn iter_chunks<'a, S, E>(mut iter: S, builder: &'a mut DataChunkBuilder) +#[try_stream(ok = StreamChunk, error = StreamExecutorError)] +pub(crate) async fn iter_chunks<'a, S, E, R>(mut iter: S, builder: &'a mut DataChunkBuilder) where StreamExecutorError: From, - S: Stream> + Unpin + 'a, + R: Row, + S: Stream> + Unpin + 'a, { while let Some(data_chunk) = collect_data_chunk_with_builder(&mut iter, builder) .instrument_await("backfill_snapshot_read") @@ -465,63 +600,95 @@ where debug_assert!(data_chunk.cardinality() > 0); let ops = vec![Op::Insert; data_chunk.capacity()]; let stream_chunk = StreamChunk::from_parts(ops, data_chunk); - yield Some(stream_chunk); + yield stream_chunk; } - - yield None; } /// Schema /// | vnode | pk | `backfill_finished` | -/// Persists the state per vnode. -/// 1. For each (`vnode`, `current_pos`), -/// Either insert if no old state, -/// Or update the state if have old state. +/// Persists the state per vnode based on `BackfillState`. +/// We track the current committed state via `committed_progress` +/// so we know whether we need to persist the state or not. +/// +/// The state is encoded as follows: +/// `NotStarted`: +/// - Not persist to store at all. +/// +/// `InProgress`: +/// - Format: | vnode | pk | false | +/// - If change in current pos: Persist. +/// - No change in current pos: Do not persist. +/// +/// Completed +/// - Format: | vnode | pk | true | +/// - If previous state is `InProgress` / `NotStarted`: Persist. +/// - If previous state is Completed: Do not persist. +/// TODO(kwannoel): we should check committed state to be all `finished` in the tests. +/// TODO(kwannoel): Instead of persisting state per vnode each time, +/// we can optimize by persisting state for a subset of vnodes which were updated. pub(crate) async fn persist_state_per_vnode( epoch: EpochPair, table: &mut StateTableInner, - is_finished: bool, - backfill_state: &BackfillState, - committed_progress: &mut HashMap>, - temporary_state: &mut [Datum], + backfill_state: &mut BackfillState, + #[cfg(debug_assertions)] state_len: usize, + vnodes: impl Iterator, ) -> StreamExecutorResult<()> { - // No progress -> No need to commit anything. - if backfill_state.has_no_progress() { - table.commit_no_data_expected(epoch); - } - - for (vnode, backfill_progress) in backfill_state.iter_backfill_progress() { - let current_pos = match backfill_progress { - BackfillProgressPerVnode::Completed | BackfillProgressPerVnode::NotStarted => { - continue; - } - BackfillProgressPerVnode::InProgress(current_pos) => current_pos, - }; - build_temporary_state_with_vnode(temporary_state, *vnode, is_finished, current_pos); - - let old_state = committed_progress.get(vnode); - - if let Some(old_state) = old_state { - // No progress for vnode, means no data - if old_state == current_pos.as_inner() { - table.commit_no_data_expected(epoch); - return Ok(()); - } else { - // There's some progress, update the state. 
- table.write_record(Record::Update { - old_row: &old_state[..], - new_row: &(*temporary_state), - }); - table.commit(epoch).await?; + let mut has_progress = false; + for vnode in vnodes { + if !backfill_state.need_commit(&vnode) { + continue; + } + let (encoded_prev_state, encoded_current_state) = + match backfill_state.get_commit_state(&vnode) { + Some((old_state, new_state)) => (old_state, new_state), + None => continue, + }; + if let Some(encoded_prev_state) = encoded_prev_state { + // There's some progress, update the state. + #[cfg(debug_assertions)] + { + let pk: &[Datum; 1] = &[Some(vnode.to_scalar().into())]; + // old_row only contains the value segment. + let old_row = table.get_row(pk).await?; + match old_row { + Some(old_row) => { + let inner = old_row.as_inner(); + // value segment (without vnode) should be used for comparison + assert_eq!(inner, &encoded_prev_state[1..]); + assert_ne!(inner, &encoded_current_state[1..]); + assert_eq!(old_row.len(), state_len - 1); + assert_eq!(encoded_current_state.len(), state_len); + } + None => { + panic!("row {:#?} not found", pk); + } + } } + table.write_record(Record::Update { + old_row: &encoded_prev_state[..], + new_row: &encoded_current_state[..], + }); + has_progress = true; } else { // No existing state, create a new entry. + #[cfg(debug_assertions)] + { + let pk: &[Datum; 1] = &[Some(vnode.to_scalar().into())]; + let row = table.get_row(pk).await?; + assert!(row.is_none(), "row {:#?}", row); + assert_eq!(encoded_current_state.len(), state_len); + } table.write_record(Record::Insert { - new_row: &(*temporary_state), + new_row: &encoded_current_state[..], }); - table.commit(epoch).await?; + has_progress = true; } - committed_progress.insert(*vnode, current_pos.as_inner().to_vec()); + backfill_state.mark_committed(vnode); + } + if has_progress { + table.commit(epoch).await?; + } else { + table.commit_no_data_expected(epoch); } Ok(()) } @@ -550,3 +717,26 @@ pub(crate) async fn persist_state( } Ok(()) } + +/// Creates a data chunk builder for snapshot read. +/// If the `rate_limit` is smaller than `chunk_size`, it will take precedence. +/// This is so we can partition snapshot read into smaller chunks than chunk size. +pub fn create_builder( + rate_limit: Option, + chunk_size: usize, + data_types: Vec, +) -> DataChunkBuilder { + if let Some(rate_limit) = rate_limit + && rate_limit < chunk_size + { + DataChunkBuilder::new( + data_types, + rate_limit, + ) + } else { + DataChunkBuilder::new( + data_types, + chunk_size, + ) + } +} diff --git a/src/stream/src/executor/hash_agg.rs b/src/stream/src/executor/hash_agg.rs index 1478534771738..1fdde9083e15a 100644 --- a/src/stream/src/executor/hash_agg.rs +++ b/src/stream/src/executor/hash_agg.rs @@ -49,7 +49,6 @@ use crate::common::StreamChunkBuilder; use crate::error::StreamResult; use crate::executor::aggregation::AggGroup as GenericAggGroup; use crate::executor::error::StreamExecutorError; -use crate::executor::monitor::StreamingMetrics; use crate::executor::{BoxedMessageStream, Executor, Message}; use crate::task::AtomicU64Ref; @@ -138,8 +137,6 @@ struct ExecutorInner { /// Should emit on window close according to watermark? 
emit_on_window_close: bool, - - metrics: Arc, } impl ExecutorInner { @@ -250,7 +247,6 @@ impl HashAggExecutor { chunk_size: args.extra.chunk_size, max_dirty_groups_heap_size: args.extra.max_dirty_groups_heap_size, emit_on_window_close: args.extra.emit_on_window_close, - metrics: args.metrics, }, }) } @@ -399,7 +395,6 @@ impl HashAggExecutor { visibilities, &mut this.distinct_dedup_tables, agg_group.group_key(), - this.actor_ctx.clone(), ) .await?; for ((call, storage), visibility) in (this.agg_calls.iter()) @@ -423,11 +418,13 @@ impl HashAggExecutor { let actor_id_str = this.actor_ctx.id.to_string(); let fragment_id_str = this.actor_ctx.fragment_id.to_string(); let table_id_str = this.intermediate_state_table.table_id().to_string(); - this.metrics + this.actor_ctx + .streaming_metrics .agg_dirty_groups_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .set(vars.dirty_groups.len() as i64); - this.metrics + this.actor_ctx + .streaming_metrics .agg_dirty_groups_heap_size .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .set(vars.dirty_groups.estimated_heap_size() as i64); @@ -526,8 +523,7 @@ impl HashAggExecutor { } // Flush distinct dedup state. - vars.distinct_dedup - .flush(&mut this.distinct_dedup_tables, this.actor_ctx.clone())?; + vars.distinct_dedup.flush(&mut this.distinct_dedup_tables)?; // Evict cache to target capacity. vars.agg_group_cache.evict(); @@ -537,23 +533,28 @@ impl HashAggExecutor { let actor_id_str = this.actor_ctx.id.to_string(); let fragment_id_str = this.actor_ctx.fragment_id.to_string(); let table_id_str = this.intermediate_state_table.table_id().to_string(); - this.metrics + this.actor_ctx + .streaming_metrics .agg_lookup_miss_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc_by(std::mem::take(&mut vars.stats.lookup_miss_count)); - this.metrics + this.actor_ctx + .streaming_metrics .agg_total_lookup_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc_by(std::mem::take(&mut vars.stats.total_lookup_count)); - this.metrics + this.actor_ctx + .streaming_metrics .agg_cached_entry_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .set(vars.agg_group_cache.len() as i64); - this.metrics + this.actor_ctx + .streaming_metrics .agg_chunk_lookup_miss_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc_by(std::mem::take(&mut vars.stats.chunk_lookup_miss_count)); - this.metrics + this.actor_ctx + .streaming_metrics .agg_chunk_total_lookup_count .with_label_values(&[&table_id_str, &actor_id_str, &fragment_id_str]) .inc_by(std::mem::take(&mut vars.stats.chunk_total_lookup_count)); @@ -591,7 +592,7 @@ impl HashAggExecutor { let window_col_idx = this.group_key_indices[window_col_idx_in_group_key]; let agg_group_cache_metrics_info = MetricsInfo::new( - this.metrics.clone(), + this.actor_ctx.streaming_metrics.clone(), this.intermediate_state_table.table_id(), this.actor_ctx.id, "agg intermediate state table", @@ -607,10 +608,9 @@ impl HashAggExecutor { dirty_groups: Default::default(), distinct_dedup: DistinctDeduplicater::new( &this.agg_calls, - &this.watermark_epoch, + this.watermark_epoch.clone(), &this.distinct_dedup_tables, - this.actor_ctx.id, - this.metrics.clone(), + this.actor_ctx.clone(), ), buffered_watermarks: vec![None; this.group_key_indices.len()], window_watermark: None, diff --git a/src/stream/src/executor/mod.rs b/src/stream/src/executor/mod.rs index 004b47559d8f7..04754d71807bb 100644 --- 
a/src/stream/src/executor/mod.rs +++ b/src/stream/src/executor/mod.rs @@ -106,6 +106,7 @@ mod utils; pub use actor::{Actor, ActorContext, ActorContextRef}; use anyhow::Context; +pub use backfill::arrangement_backfill::*; pub use backfill::cdc::{CdcBackfillExecutor, ExternalStorageTable}; pub use backfill::no_shuffle_backfill::*; pub use barrier_recv::BarrierRecvExecutor; @@ -844,6 +845,11 @@ pub async fn expect_first_barrier( let barrier = message .into_barrier() .expect("the first message must be a barrier"); + // TODO: Is this check correct? + assert!(matches!( + barrier.kind, + BarrierKind::Checkpoint | BarrierKind::Initial + )); Ok(barrier) } diff --git a/src/stream/src/executor/simple_agg.rs b/src/stream/src/executor/simple_agg.rs index 6623a05854e0b..0d33a7dc3074e 100644 --- a/src/stream/src/executor/simple_agg.rs +++ b/src/stream/src/executor/simple_agg.rs @@ -25,7 +25,6 @@ use super::agg_common::{AggExecutorArgs, SimpleAggExecutorExtraArgs}; use super::aggregation::{ agg_call_filter_res, iter_table_storage, AggStateStorage, AlwaysOutput, DistinctDeduplicater, }; -use super::monitor::StreamingMetrics; use super::*; use crate::common::table::state_table::StateTable; use crate::error::StreamResult; @@ -91,8 +90,6 @@ struct ExecutorInner { /// Extreme state cache size extreme_cache_size: usize, - - metrics: Arc, } impl ExecutorInner { @@ -151,7 +148,6 @@ impl SimpleAggExecutor { distinct_dedup_tables: args.distinct_dedup_tables, watermark_epoch: args.watermark_epoch, extreme_cache_size: args.extreme_cache_size, - metrics: args.metrics, }, }) } @@ -182,7 +178,6 @@ impl SimpleAggExecutor { call_visibilities, &mut this.distinct_dedup_tables, None, - this.actor_ctx.clone(), ) .await?; @@ -212,8 +207,7 @@ impl SimpleAggExecutor { ) -> StreamExecutorResult> { let chunk = if vars.state_changed || vars.agg_group.is_uninitialized() { // Flush distinct dedup state. - vars.distinct_dedup - .flush(&mut this.distinct_dedup_tables, this.actor_ctx.clone())?; + vars.distinct_dedup.flush(&mut this.distinct_dedup_tables)?; // Flush states into intermediate state table. 
let encoded_states = vars.agg_group.encode_states(&this.agg_funcs)?; @@ -266,10 +260,9 @@ impl SimpleAggExecutor { let mut distinct_dedup = DistinctDeduplicater::new( &this.agg_calls, - &this.watermark_epoch, + this.watermark_epoch.clone(), &this.distinct_dedup_tables, - this.actor_ctx.id, - this.metrics.clone(), + this.actor_ctx.clone(), ); distinct_dedup.dedup_caches_mut().for_each(|cache| { cache.update_epoch(barrier.epoch.curr); diff --git a/src/stream/src/executor/test_utils.rs b/src/stream/src/executor/test_utils.rs index 9e5f7ed036b19..9547744443f48 100644 --- a/src/stream/src/executor/test_utils.rs +++ b/src/stream/src/executor/test_utils.rs @@ -293,7 +293,6 @@ pub mod agg_executor { AggExecutorArgs, HashAggExecutorExtraArgs, SimpleAggExecutorExtraArgs, }; use crate::executor::aggregation::AggStateStorage; - use crate::executor::monitor::StreamingMetrics; use crate::executor::{ ActorContext, ActorContextRef, BoxedExecutor, Executor, ExecutorInfo, HashAggExecutor, PkIndices, SimpleAggExecutor, @@ -498,7 +497,6 @@ pub mod agg_executor { intermediate_state_table, distinct_dedup_tables: Default::default(), watermark_epoch: Arc::new(AtomicU64::new(0)), - metrics: Arc::new(StreamingMetrics::unused()), extra: HashAggExecutorExtraArgs { group_key_indices, @@ -569,7 +567,6 @@ pub mod agg_executor { intermediate_state_table, distinct_dedup_tables: Default::default(), watermark_epoch: Arc::new(AtomicU64::new(0)), - metrics: Arc::new(StreamingMetrics::unused()), extra: SimpleAggExecutorExtraArgs {}, }) .unwrap() diff --git a/src/stream/src/from_proto/hash_agg.rs b/src/stream/src/from_proto/hash_agg.rs index 7b4c70a592417..b79913da18269 100644 --- a/src/stream/src/from_proto/hash_agg.rs +++ b/src/stream/src/from_proto/hash_agg.rs @@ -110,7 +110,6 @@ impl ExecutorBuilder for HashAggExecutorBuilder { intermediate_state_table, distinct_dedup_tables, watermark_epoch: stream.get_watermark_epoch(), - metrics: params.executor_stats, extra: HashAggExecutorExtraArgs { group_key_indices, chunk_size: params.env.config().developer.chunk_size, diff --git a/src/stream/src/from_proto/simple_agg.rs b/src/stream/src/from_proto/simple_agg.rs index a61cf375ae50b..61a5937aa4092 100644 --- a/src/stream/src/from_proto/simple_agg.rs +++ b/src/stream/src/from_proto/simple_agg.rs @@ -71,7 +71,6 @@ impl ExecutorBuilder for SimpleAggExecutorBuilder { intermediate_state_table, distinct_dedup_tables, watermark_epoch: stream.get_watermark_epoch(), - metrics: params.executor_stats, extra: SimpleAggExecutorExtraArgs {}, })? 
.boxed()) diff --git a/src/stream/src/from_proto/stream_scan.rs b/src/stream/src/from_proto/stream_scan.rs index f6f35b33f601b..a5dc8abaf7bcb 100644 --- a/src/stream/src/from_proto/stream_scan.rs +++ b/src/stream/src/from_proto/stream_scan.rs @@ -16,15 +16,18 @@ use std::sync::Arc; use risingwave_common::catalog::{ColumnDesc, ColumnId, TableId, TableOption}; use risingwave_common::util::sort_util::OrderType; +use risingwave_common::util::value_encoding::column_aware_row_encoding::ColumnAwareSerde; +use risingwave_common::util::value_encoding::BasicSerde; use risingwave_pb::plan_common::StorageTableDesc; use risingwave_pb::stream_plan::{StreamScanNode, StreamScanType}; use risingwave_storage::table::batch_table::storage_table::StorageTable; use risingwave_storage::table::Distribution; use super::*; -use crate::common::table::state_table::StateTable; +use crate::common::table::state_table::{ReplicatedStateTable, StateTable}; use crate::executor::{ - BackfillExecutor, ChainExecutor, FlowControlExecutor, RearrangedChainExecutor, + ArrangementBackfillExecutor, BackfillExecutor, ChainExecutor, FlowControlExecutor, + RearrangedChainExecutor, }; pub struct StreamScanExecutorBuilder; @@ -58,6 +61,7 @@ impl ExecutorBuilder for StreamScanExecutorBuilder { StreamScanType::Rearrange => { RearrangedChainExecutor::new(params.info, snapshot, upstream, progress).boxed() } + StreamScanType::Backfill => { let table_desc: &StorageTableDesc = node.get_table_desc()?; let table_id = TableId { @@ -116,6 +120,16 @@ impl ExecutorBuilder for StreamScanExecutorBuilder { .collect_vec(); let prefix_hint_len = table_desc.get_read_prefix_len_hint() as usize; let versioned = table_desc.versioned; + + let state_table = if let Ok(table) = node.get_state_table() { + Some( + StateTable::from_table_catalog(table, state_store.clone(), vnodes.clone()) + .await, + ) + } else { + None + }; + // TODO: refactor it with from_table_catalog in the future. let upstream_table = StorageTable::new_partial( state_store.clone(), @@ -130,11 +144,6 @@ impl ExecutorBuilder for StreamScanExecutorBuilder { prefix_hint_len, versioned, ); - let state_table = if let Ok(table) = node.get_state_table() { - Some(StateTable::from_table_catalog(table, state_store, vnodes).await) - } else { - None - }; BackfillExecutor::new( params.info, @@ -149,6 +158,56 @@ impl ExecutorBuilder for StreamScanExecutorBuilder { ) .boxed() } + StreamScanType::ArrangementBackfill => { + let column_ids = node + .upstream_column_ids + .iter() + .map(ColumnId::from) + .collect_vec(); + + let vnodes = params.vnode_bitmap.map(Arc::new); + + let state_table = node.get_state_table().unwrap(); + let state_table = StateTable::from_table_catalog( + state_table, + state_store.clone(), + vnodes.clone(), + ) + .await; + + let upstream_table = node.get_arrangement_table().unwrap(); + let versioned = upstream_table.get_version().is_ok(); + + macro_rules! 
new_executor { + ($SD:ident) => {{ + let upstream_table = + ReplicatedStateTable::<_, $SD>::from_table_catalog_with_output_column_ids( + upstream_table, + state_store.clone(), + vnodes, + column_ids, + ) + .await; + ArrangementBackfillExecutor::<_, $SD>::new( + params.info, + upstream_table, + upstream, + state_table, + output_indices, + progress, + stream.streaming_metrics.clone(), + params.env.config().developer.chunk_size, + node.rate_limit.map(|x| x as _), + ) + .boxed() + }}; + } + if versioned { + new_executor!(ColumnAwareSerde) + } else { + new_executor!(BasicSerde) + } + } StreamScanType::Unspecified => unreachable!(), }; Ok(FlowControlExecutor::new( diff --git a/src/tests/compaction_test/src/delete_range_runner.rs b/src/tests/compaction_test/src/delete_range_runner.rs index 5982fe818403b..d2acd7c754c74 100644 --- a/src/tests/compaction_test/src/delete_range_runner.rs +++ b/src/tests/compaction_test/src/delete_range_runner.rs @@ -63,7 +63,6 @@ use risingwave_storage::store::{ use risingwave_storage::StateStore; use crate::CompactionTestOpts; - pub fn start_delete_range(opts: CompactionTestOpts) -> Pin + Send>> { // WARNING: don't change the function signature. Making it `async fn` will cause // slow compile in release mode. @@ -435,13 +434,10 @@ impl NormalState { .get( TableKey(Bytes::copy_from_slice(key)), ReadOptions { - prefix_hint: None, ignore_range_tombstone, - retention_seconds: None, table_id: self.table_id, - read_version_from_backup: false, - prefetch_options: Default::default(), cache_policy: CachePolicy::Fill(CachePriority::High), + ..Default::default() }, ) .await @@ -462,13 +458,12 @@ impl NormalState { Bound::Excluded(TableKey(Bytes::copy_from_slice(right))), ), ReadOptions { - prefix_hint: None, ignore_range_tombstone, - retention_seconds: None, table_id: self.table_id, read_version_from_backup: false, prefetch_options: PrefetchOptions::default(), cache_policy: CachePolicy::Fill(CachePriority::High), + ..Default::default() }, ) .await @@ -494,13 +489,12 @@ impl CheckState for NormalState { Bound::Excluded(Bytes::copy_from_slice(right)).map(TableKey), ), ReadOptions { - prefix_hint: None, ignore_range_tombstone: true, - retention_seconds: None, table_id: self.table_id, read_version_from_backup: false, prefetch_options: PrefetchOptions::default(), cache_policy: CachePolicy::Fill(CachePriority::High), + ..Default::default() }, ) .await diff --git a/src/tests/simulation/tests/integration_tests/recovery/background_ddl.rs b/src/tests/simulation/tests/integration_tests/recovery/background_ddl.rs index 3d03aeb4067d4..4e1ef135f839c 100644 --- a/src/tests/simulation/tests/integration_tests/recovery/background_ddl.rs +++ b/src/tests/simulation/tests/integration_tests/recovery/background_ddl.rs @@ -73,7 +73,7 @@ async fn cancel_stream_jobs(session: &mut Session) -> Result> { tracing::info!("cancelling streaming jobs"); let ids = ids.split('\n').collect::>().join(","); let result = session.run(&format!("cancel jobs {};", ids)).await?; - tracing::info!("cancelled streaming jobs, {:#?}", result); + tracing::info!("cancelled streaming jobs, {}", result); let ids = result .split('\n') .map(|s| s.parse::().unwrap()) @@ -195,7 +195,7 @@ async fn test_ddl_cancel() -> Result<()> { session.run(CREATE_TABLE).await?; session.run(SEED_TABLE_500).await?; session.flush().await?; - session.run(SET_RATE_LIMIT_2).await?; + session.run(SET_RATE_LIMIT_1).await?; session.run(SET_BACKGROUND_DDL).await?; for _ in 0..5 { @@ -369,3 +369,38 @@ async fn test_sink_create() -> Result<()> { Ok(()) } + 
+#[tokio::test] +async fn test_background_agg_mv_recovery() -> Result<()> { + init_logger(); + let mut cluster = Cluster::start(Configuration::for_background_ddl()).await?; + let mut session = cluster.start_session(); + + session.run("CREATE TABLE t1 (v1 int)").await?; + session + .run("INSERT INTO t1 SELECT generate_series FROM generate_series(1, 200);") + .await?; + session.flush().await?; + session.run(SET_RATE_LIMIT_1).await?; + session.run(SET_BACKGROUND_DDL).await?; + session + .run("CREATE MATERIALIZED VIEW mv1 as select v1, count(*) from t1 group by v1;") + .await?; + sleep(Duration::from_secs(2)).await; + + kill_cn_and_meta_and_wait_recover(&cluster).await; + + // Now just wait for it to complete. + session.run(WAIT).await?; + + let t_count = session.run("SELECT COUNT(v1) FROM t1").await?; + let mv1_count = session.run("SELECT COUNT(v1) FROM mv1").await?; + assert_eq!(t_count, mv1_count); + + // Make sure that if MV killed and restarted + // it will not be dropped. + session.run("DROP MATERIALIZED VIEW mv1;").await?; + session.run("DROP TABLE t1;").await?; + + Ok(()) +} diff --git a/src/tests/sqlsmith/scripts/extract_queries.sh b/src/tests/sqlsmith/scripts/extract_queries.sh index ed9d5c9ee1d3f..9abac600296aa 100755 --- a/src/tests/sqlsmith/scripts/extract_queries.sh +++ b/src/tests/sqlsmith/scripts/extract_queries.sh @@ -15,7 +15,7 @@ SHRUNK_OUTPUT_FILE="$2".shrunk echo "--- Extracting queries" cat "$LOG_FILE" | rg "\[EXECUTING .*\]" | sed 's/.*\[EXECUTING .*\]: //' | sed 's/$/;/' > "$OUTPUT_FILE" -echo "--- Extracted queries to $LOG_FILE" +echo "--- Extracted queries to $OUTPUT_FILE" echo "--- Shrinking queries" cargo run --bin sqlsmith-reducer -- --input-file "$OUTPUT_FILE" --output-file "$SHRUNK_OUTPUT_FILE"
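
The backfill changes above track each vnode's progress as NotStarted -> InProgress(pos) -> Completed(pos), and `mark_chunk_ref_by_vnode` uses that progress to decide which upstream rows to forward while the snapshot is still being read. The following self-contained sketch models both rules with a plain integer pk instead of `OwnedRow` and `cmp_datum_iter`; it is an illustration of the intended transitions, not code from the patch.

use std::cmp::Ordering;

#[derive(Clone, Debug, PartialEq, Eq)]
enum Progress {
    NotStarted,      // no snapshot rows read for this vnode yet
    InProgress(i64), // backfilled up to and including this pk
    Completed(i64),  // snapshot fully read; position kept for state persistence
}

// Mirrors `update_progress`: a finished vnode must never be advanced again.
fn advance(progress: &mut Progress, new_pos: i64) {
    match progress {
        Progress::NotStarted | Progress::InProgress(_) => *progress = Progress::InProgress(new_pos),
        Progress::Completed(_) => unreachable!("vnode already finished backfill"),
    }
}

// Mirrors the per-row decision in `mark_chunk_ref_by_vnode`: forward an upstream row
// only if its vnode has completed backfill, or is in progress and the row's pk is
// less than or equal to the current position.
fn should_forward(progress: &Progress, row_pk: i64) -> bool {
    match progress {
        Progress::Completed(_) => true,
        Progress::NotStarted => false,
        Progress::InProgress(current_pos) => {
            matches!(row_pk.cmp(current_pos), Ordering::Less | Ordering::Equal)
        }
    }
}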
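
`get_commit_state` in utils.rs encodes each vnode's progress as | vnode | pk ... | is_finished | row_count |, with the three metadata columns counted by `METADATA_STATE_LEN`. The helper below is a minimal sketch of that row layout using only calls that already appear in the patch (`to_scalar`, `as_inner`, `Datum`); the function name and signature are illustrative and not part of the change.

use risingwave_common::hash::VirtualNode;
use risingwave_common::row::{OwnedRow, Row};
use risingwave_common::types::Datum;

/// `vnode`, `is_finished`, `row_count` occupy one column each.
const METADATA_STATE_LEN: usize = 3;

// Hypothetical helper mirroring the encoding in `get_commit_state`.
fn encode_progress_row(vnode: VirtualNode, current_pos: &OwnedRow, is_finished: bool) -> Vec<Datum> {
    let mut state = vec![None; current_pos.len() + METADATA_STATE_LEN];
    state[0] = Some(vnode.to_scalar().into()); // vnode column
    state[1..current_pos.len() + 1].clone_from_slice(current_pos.as_inner()); // upstream pk
    state[current_pos.len() + 1] = Some(is_finished.into()); // backfill_finished flag
    state[current_pos.len() + 2] = Some(0i64.into()); // row_count, filled in later
    state
}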
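
`create_builder`, now public in utils.rs, caps the snapshot `DataChunkBuilder` at the rate limit whenever it is smaller than the configured chunk size, so throttling can take effect between smaller chunks. A usage sketch follows, with assumed values and an assumed `Option<usize>` rate-limit type; the import path and data types are illustrative.

use risingwave_common::types::DataType;
use risingwave_common::util::chunk_coalesce::DataChunkBuilder;

// Assumed path to the helper added in this patch.
use crate::executor::backfill::utils::create_builder;

fn snapshot_builder_for_backfill() -> DataChunkBuilder {
    let data_types = vec![DataType::Int64, DataType::Varchar];
    // rate_limit = 256 < chunk_size = 1024, so snapshot chunks are capped at 256 rows.
    create_builder(Some(256), 1024, data_types)
}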