diff --git a/tests/integration/.coveragerc b/.coveragerc similarity index 100% rename from tests/integration/.coveragerc rename to .coveragerc diff --git a/.github/workflows/acceptance.yml b/.github/workflows/acceptance.yml index f56e229..c346a93 100644 --- a/.github/workflows/acceptance.yml +++ b/.github/workflows/acceptance.yml @@ -45,6 +45,13 @@ jobs: - name: Run unit tests and generate test coverage report run: make test + # Integration tests are run from within tests/integration folder. + # We need to make sure .coveragerc is there so that code coverage is generated for the right modules. + - name: Prepare code coverage configuration for integration tests + run: cp .coveragerc tests/integration + + # Run tests from `tests/integration` as defined in .codegen.json + # and generate code coverage for modules defined in .coveragerc - name: Run integration tests and generate test coverage report uses: databrickslabs/sandbox/acceptance@acceptance/v0.4.3 with: diff --git a/.github/workflows/docs-release.yml b/.github/workflows/docs-release.yml index 26c32b1..41fdd56 100644 --- a/.github/workflows/docs-release.yml +++ b/.github/workflows/docs-release.yml @@ -4,7 +4,7 @@ on: push: tags: - 'v[0-9]+.[0-9]+.[0-9]+' # Must match semantic version tags like 'v1.2.3' - workflow_dispatch: # Enables manual triggering of the workflow + workflow_dispatch: # Allows manual triggering of the workflow jobs: build: diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml new file mode 100644 index 0000000..416039e --- /dev/null +++ b/.github/workflows/nightly.yml @@ -0,0 +1,62 @@ +name: nightly + +on: + workflow_dispatch: # Allows manual triggering of the workflow + schedule: + - cron: '0 4 * * *' # Runs automatically at 4:00 AM UTC every day + +permissions: + id-token: write + issues: write + contents: read + pull-requests: read + +concurrency: + group: single-acceptance-job-per-repo + +jobs: + integration: + environment: tool + runs-on: larger + steps: + - name: Checkout Code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install Python + uses: actions/setup-python@v5 + with: + cache: 'pip' + cache-dependency-path: '**/pyproject.toml' + python-version: '3.10' + + - name: Install hatch + run: pip install hatch==1.9.4 + + - name: Run unit tests and generate test coverage report + run: make test + + # Acceptance tests are run from within tests/integration folder. + # We need to make sure .coveragerc is there so that code coverage is generated for the right modules. 
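+      # Illustrative only (assumed .coveragerc contents, not defined by this workflow): the root
+      # .coveragerc typically scopes coverage to the package sources, for example:
+      #   [run]
+      #   relative_files = true
+      #   source = databricks.labs.dqx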
+ - name: Prepare .coveragerc for integration tests + run: cp .coveragerc tests/integration + + # Run tests from `tests/integration` as defined in .codegen.json + # and generate code coverage for modules defined in .coveragerc + - name: Run integration tests and generate test coverage report + uses: databrickslabs/sandbox/acceptance@acceptance/v0.4.3 + with: + vault_uri: ${{ secrets.VAULT_URI }} + timeout: 2h + create_issues: true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ARM_CLIENT_ID: ${{ secrets.ARM_CLIENT_ID }} + ARM_TENANT_ID: ${{ secrets.ARM_TENANT_ID }} + + # collects all coverage reports: coverage.xml from integration tests, coverage-unit.xml from unit tests + - name: Publish test coverage + uses: codecov/codecov-action@v5 + with: + use_oidc: true diff --git a/Makefile b/Makefile index e9ec38a..176e619 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,7 @@ setup_spark_remote: test: setup_spark_remote ci-test coverage: - hatch run coverage && open htmlcov/index.html + hatch run coverage; open htmlcov/index.html docs-build: yarn --cwd docs/dqx build diff --git a/demos/dqx_demo_tool.py b/demos/dqx_demo_tool.py index 3821bd0..78cc3c5 100644 --- a/demos/dqx_demo_tool.py +++ b/demos/dqx_demo_tool.py @@ -227,6 +227,6 @@ from databricks.labs.dqx.contexts.workspace import WorkspaceContext ctx = WorkspaceContext(WorkspaceClient()) -dashboards_folder_link = f"{ctx.installation.workspace_link("")}dashboards/" +dashboards_folder_link = f"{ctx.installation.workspace_link('')}dashboards/" print(f"Open a dashboard from the following folder and refresh it:") print(dashboards_folder_link) \ No newline at end of file diff --git a/docs/dqx/docs/demos.mdx b/docs/dqx/docs/demos.mdx index 3e8926b..696f898 100644 --- a/docs/dqx/docs/demos.mdx +++ b/docs/dqx/docs/demos.mdx @@ -8,3 +8,6 @@ Install the [installation](/docs/installation) framework, and import the followi * [DQX Demo Notebook (library)](https://github.com/databrickslabs/dqx/blob/main/demos/dqx_demo_library.py) - demonstrates how to use DQX as a library. * [DQX Demo Notebook (tool)](https://github.com/databrickslabs/dqx/blob/main/demos/dqx_demo_tool.py) - demonstrates how to use DQX as a tool when installed in the workspace. * [DQX DLT Demo Notebook](https://github.com/databrickslabs/dqx/blob/main/demos/dqx_dlt_demo.py) - demonstrates how to use DQX with Delta Live Tables (DLT). + +Note that DQX doesn't have to be run from a notebook. You can run it from any Python script as long as it runs on Databricks. +For example, you can add DQX as a library to your job or cluster. \ No newline at end of file diff --git a/docs/dqx/docs/dev/contributing.mdx b/docs/dqx/docs/dev/contributing.mdx index cfd18b0..02ee0ae 100644 --- a/docs/dqx/docs/dev/contributing.mdx +++ b/docs/dqx/docs/dev/contributing.mdx @@ -19,49 +19,43 @@ While minimizing external dependencies is essential, exceptions can be made case justified, such as when a well-established and actively maintained library provides significant benefits, like time savings, performance improvements, or specialized functionality unavailable in standard libraries. -## Common fixes for `mypy` errors - -See https://mypy.readthedocs.io/en/stable/cheat_sheet_py3.html for more details - -### ..., expression has type "None", variable has type "str" - -* Add `assert ... is not None` if it's a body of a method.
Example: - -``` -# error: Argument 1 to "delete" of "DashboardWidgetsAPI" has incompatible type "str | None"; expected "str" -self._ws.dashboard_widgets.delete(widget.id) -``` - -after - -``` -assert widget.id is not None -self._ws.dashboard_widgets.delete(widget.id) -``` - -* Add `... | None` if it's in the dataclass. Example: `cloud: str = None` -> `cloud: str | None = None` - -### ..., has incompatible type "Path"; expected "str" - -Add `.as_posix()` to convert Path to str - -### Argument 2 to "get" of "dict" has incompatible type "None"; expected ... +## First contribution -Add a valid default value for the dictionary return. +If you're interested in contributing, please create a PR, reach out to us or open an issue to discuss your ideas. -Example: -```python -def viz_type(self) -> str: - return self.viz.get("type", None) -``` +Here are the example steps to submit your first contribution: -after: +1. Fork the repo. You can also create a branch if you are added as a writer to the repo. +2. Clone the repo locally: `git clone` +3. `git checkout main` (or `gcm` if you're using [ohmyzsh](https://ohmyz.sh/)). +4. `git pull` (or `gl` if you're using [ohmyzsh](https://ohmyz.sh/)). +5. `git checkout -b FEATURENAME` (or `gcb FEATURENAME` if you're using [ohmyzsh](https://ohmyz.sh/)). +6. .. do the work +7. `make fmt` +8. `make lint` +9. .. fix if any issues reported +10. `make setup_spark_remote`, `make test` and `make integration`, and optionally `make coverage` (to generate a coverage report) +11. .. fix if any issues reported +12. `git commit -S -a -m "message"` + + Make sure to enter a meaningful commit message title. + You need to sign commits with your GPG key (hence the -S option). + To set up a GPG key in your GitHub account, follow [these instructions](https://docs.github.com/en/github/authenticating-to-github/managing-commit-signature-verification). + You can configure Git to sign all commits with your GPG key by default: `git config --global commit.gpgsign true` + + If you have not signed your commits initially, you can re-apply all of them and sign as follows: + ```shell + git reset --soft HEAD~ + git commit -S --reuse-message=ORIG_HEAD + git push -f origin + ``` +13. `git push origin FEATURENAME` -Example: -```python -def viz_type(self) -> str: - return self.viz.get("type", "UNKNOWN") -``` + To access the repository, you must use the HTTPS remote with a personal access token or SSH with an SSH key and passphrase that has been authorized for the `databrickslabs` organization. +14. Go to the GitHub UI and create a PR. Alternatively, `gh pr create` (if you have [GitHub CLI](https://cli.github.com/) installed). Use a meaningful pull request title because it'll appear in the release notes. Use `Resolves #NUMBER` in the pull + request description to [automatically link it](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/using-keywords-in-issues-and-pull-requests#linking-a-pull-request-to-an-issue) + to an existing issue. ## Local Setup @@ -98,9 +92,9 @@ The command `make setup_spark_remote` sets up the environment for running unit t DQX uses Databricks Connect as a test dependency, which restricts the creation of a Spark session in local mode. To enable spark local execution for unit testing, the command install spark remote. -### Local setup for integration tests and code coverage +### Running integration tests and code coverage -Note that integration tests and code coverage are run automatically when you create a Pull Request in Github.
+Integration tests and code coverage are run automatically when you create a Pull Request in GitHub. You can also trigger the tests from a local machine by configuring authentication to a Databricks workspace. You can use any Unity Catalog enabled Databricks workspace. @@ -171,12 +165,29 @@ To run integration tests on serverless compute, add the `DATABRICKS_SERVERLESS_C } } ``` -When `DATABRICKS_SERVERLESS_COMPUTE_ID` is set the `DATABRICKS_CLUSTER_ID` is ignored, and tests will run on serverless compute. +When `DATABRICKS_SERVERLESS_COMPUTE_ID` is set the `DATABRICKS_CLUSTER_ID` is ignored, and tests run on serverless compute. -## Running CLI from the local repo +## Manual testing of the framework + +We require that all changes be covered by unit tests and integration tests. A pull request (PR) will be blocked if the code coverage is negatively impacted by the proposed change. +However, manual testing may still be useful before creating or merging a PR. + +To test DQX from your feature branch, you can install it directly as follows: +```commandline +pip install git+https://github.com/databrickslabs/dqx.git@feature_branch_name +``` + +Replace `feature_branch_name` with the name of your branch. + +## Manual testing of the CLI commands from the current codebase Once you clone the repo locally and install Databricks CLI you can run labs CLI commands from the root of the repository. -Similar to other databricks cli commands we can specify profile to use with `--profile`. +Similar to other Databricks CLI commands, you can specify the Databricks profile to use with `--profile`. + +Build the project: +```commandline +make dev +``` Authenticate your current machine to your Databricks Workspace: ```commandline @@ -190,6 +201,7 @@ databricks labs show . Install dqx: ```commandline +# use the current codebase databricks labs install . ``` @@ -203,43 +215,72 @@ Uninstall DQX: databricks labs uninstall dqx ``` -## First contribution +## Manual testing of the CLI commands from a pre-release version -If you're interested in contributing, please reach out to us or open an issue to discuss your ideas. -To contribute, you need to be added as a writer to the repository. -Please note that we currently do not accept external contributors. +In most cases, installing DQX directly from the current codebase is sufficient to test CLI commands. However, this approach may not be ideal in some cases because the CLI would use the current development virtual environment. +When DQX is installed from a released version, it creates a fresh and isolated Python virtual environment locally and installs all the required packages, ensuring a clean setup. +If you need to perform end-to-end testing of the CLI before an official release, follow the process outlined below. -Here are the example steps to submit your first contribution: +Note: This is only available for GitHub accounts that have write access to the repository. If you contribute from a fork, this method is not available. -1. Make a branch in the dqx repo -2. `git clone` -3. `git checkout main` (or `gcm` if you're using [ohmyzsh](https://ohmyz.sh/)). -4. `git pull` (or `gl` if you're using [ohmyzsh](https://ohmyz.sh/)). -5. `git checkout -b FEATURENAME` (or `gcb FEATURENAME` if you're using [ohmyzsh](https://ohmyz.sh/)). -6. .. do the work -7. `make fmt` -8. `make lint` -9. .. fix if any -10. `make setup_spark_remote`, make test` and `make integration`, optionally `make coverage` to get test coverage report -11. .. fix if any issues -12. `git commit -S -a -m "message"`.
-Make sure to enter a meaningful commit message title. -You need to sign commits with your GPG key (hence -S option). -To setup GPG key in your Github account follow [these instructions](https://docs.github.com/en/github/authenticating-to-github/managing-commit-signature-verification). -You can configure Git to sign all commits with your GPG key by default: `git config --global commit.gpgsign true` -13. `git push origin FEATURENAME` -14. Go to GitHub UI and create PR. Alternatively, `gh pr create` (if you have [GitHub CLI](https://cli.github.com/) installed). - Use a meaningful pull request title because it'll appear in the release notes. Use `Resolves #NUMBER` in pull - request description to [automatically link it](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/using-keywords-in-issues-and-pull-requests#linking-a-pull-request-to-an-issue) - to an existing issue. +```commandline +# create new tag +git tag v0.1.12-alpha -If you have not signed your commits initially, you can re-apply all of them and sign as follows: -```shell -git reset --soft HEAD~ -git commit -S --reuse-message=ORIG_HEAD -git push -f origin +# push the tag +git push origin v0.1.12-alpha + +# specify the tag (pre-release version) +databricks labs install dqx@v0.1.12-alpha ``` +The release pipeline only triggers when a valid semantic version is provided (e.g. v0.1.12). +Pre-release versions (e.g. v0.1.12-alpha) do not trigger the release pipeline, allowing you to test changes safely before making an official release. + ## Troubleshooting If you encounter any package dependency errors after `git pull`, run `make clean` + +### Common fixes for `mypy` errors + +See https://mypy.readthedocs.io/en/stable/cheat_sheet_py3.html for more details + +**..., expression has type "None", variable has type "str"** + +* Add `assert ... is not None` if it's a body of a method. Example: + +``` +# error: Argument 1 to "delete" of "DashboardWidgetsAPI" has incompatible type "str | None"; expected "str" +self._ws.dashboard_widgets.delete(widget.id) +``` + +after + +``` +assert widget.id is not None +self._ws.dashboard_widgets.delete(widget.id) +``` + +* Add `... | None` if it's in the dataclass. Example: `cloud: str = None` -> `cloud: str | None = None` + +**..., has incompatible type "Path"; expected "str"** + +Add `.as_posix()` to convert Path to str + +**Argument 2 to "get" of "dict" has incompatible type "None"; expected ...** + +Add a valid default value for the dictionary return. + +Example: +```python +def viz_type(self) -> str: + return self.viz.get("type", None) +``` + +after: + +Example: +```python +def viz_type(self) -> str: + return self.viz.get("type", "UNKNOWN") +``` \ No newline at end of file diff --git a/docs/dqx/docs/guide.mdx b/docs/dqx/docs/guide.mdx index 0b19450..7e45dbf 100644 --- a/docs/dqx/docs/guide.mdx +++ b/docs/dqx/docs/guide.mdx @@ -419,20 +419,22 @@ Discover the full list of available data quality rules and learn how to define y To perform data quality checking with DQX, you need to create `DQEngine` object. The engine requires a Databricks workspace client for authentication and interaction with the Databricks workspace. -When running the code on a Databricks workspace (e.g. in a notebook or as a job), the workspace client is automatically authenticated. -For external environments (e.g. CI servers or local machines), you can authenticate using any method supported by the Databricks SDK. 
Detailed instructions are available in the [default authentication flow](https://databricks-sdk-py.readthedocs.io/en/latest/authentication.html#default-authentication-flow). - -If you use Databricks [configuration profiles](https://docs.databricks.com/dev-tools/auth.html#configuration-profiles) or Databricks-specific [environment variables](https://docs.databricks.com/dev-tools/auth.html#environment-variables) for authentication, you only need the following code to create a workspace client: +When running the code on a Databricks workspace, the workspace client is automatically authenticated, whether DQX is used in a notebook, script, or as part of a job/workflow. +You only need the following code to create the workspace client if you run DQX on a Databricks workspace: ```python from databricks.sdk import WorkspaceClient from databricks.labs.dqx.engine import DQEngine ws = WorkspaceClient() - -# use the workspace client to create the DQX engine dq_engine = DQEngine(ws) ``` +For external environments, such as CI servers or local machines, you can authenticate to Databricks using any method supported by the Databricks SDK. For detailed instructions, refer to the [default authentication flow](https://databricks-sdk-py.readthedocs.io/en/latest/authentication.html#default-authentication-flow). +If you're using Databricks [configuration profiles](https://docs.databricks.com/dev-tools/auth.html#configuration-profiles) or Databricks-specific [environment variables](https://docs.databricks.com/dev-tools/auth.html#environment-variables) for authentication, you can easily create the workspace client without needing to provide additional arguments: +```python +ws = WorkspaceClient() +``` + For details on the specific methods available in the engine, visit to the [reference](/docs/reference#dq-engine-methods) section. Information on testing applications that use `DQEngine` can be found [here](/docs/reference#testing-applications-using-dqx). diff --git a/docs/dqx/docs/installation.mdx b/docs/dqx/docs/installation.mdx index c7e9ec0..1f632fa 100644 --- a/docs/dqx/docs/installation.mdx +++ b/docs/dqx/docs/installation.mdx @@ -10,17 +10,21 @@ The framework can be installed on a Databricks workspace or used as a standalone - Python 3.10 or later. See [instructions](https://www.python.org/downloads/). - Unity Catalog-enabled [Databricks workspace](https://docs.databricks.com/en/getting-started/index.html). - Network access to your Databricks Workspace used for the installation process. -- (Optional) Databricks CLI v0.213 or later. See [instructions](https://docs.databricks.com/dev-tools/cli/databricks-cli.html). +- (Optional) Databricks CLI v0.241 or later. See [instructions](https://docs.databricks.com/dev-tools/cli/databricks-cli.html). - Databricks Runtime with Spark 3.5.0 or higher. See [instructions](https://docs.databricks.com/clusters/create.html). ## DQX installation as a Library Install the project via `pip`: - ```commandline pip install databricks-labs-dqx ``` +Install a specific version of the project via `pip` (e.g. version 0.1.12): +```commandline +pip install databricks-labs-dqx==0.1.12 +``` + ## DQX installation in a Databricks Workspace If you choose to install DQX via PyPI and use it purely as a library, you don’t need to pre-install DQX in the workspace.
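For illustration, a minimal sketch of that library-only usage, combining the authentication and engine-creation snippets above (the profile name `DEFAULT` is an assumption; any profile configured via `databricks auth login` works):

```python
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.engine import DQEngine

# Authenticate using a Databricks configuration profile (the profile name is illustrative).
ws = WorkspaceClient(profile="DEFAULT")

# The engine only needs an authenticated workspace client.
dq_engine = DQEngine(ws)
```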
@@ -40,11 +44,15 @@ More about authentication options [here](https://docs.databricks.com/en/dev-tool ### Install DQX Install DQX in your Databricks workspace via Databricks CLI: - ```commandline databricks labs install dqx ``` +Install a specific version of DQX in your Databricks workspace via Databricks CLI (e.g. version 0.1.12): +```commandline +databricks labs install dqx@v0.1.12 +``` + You'll be prompted to select a [configuration profile](https://docs.databricks.com/en/dev-tools/auth.html#databricks-client-unified-authentication) created by `databricks auth login` command, and other configuration options. diff --git a/docs/dqx/docs/reference.mdx b/docs/dqx/docs/reference.mdx index b7f8c24..f6907a6 100644 --- a/docs/dqx/docs/reference.mdx +++ b/docs/dqx/docs/reference.mdx @@ -239,7 +239,7 @@ To run the integration tests on serverless compute, add the `DATABRICKS_SERVERLE } } ``` -When `DATABRICKS_SERVERLESS_COMPUTE_ID` is set the `DATABRICKS_CLUSTER_ID` is ignored, and tests will run on serverless compute. +When `DATABRICKS_SERVERLESS_COMPUTE_ID` is set the `DATABRICKS_CLUSTER_ID` is ignored, and tests run on serverless compute. ### Local testing with DQEngine diff --git a/labs.yml b/labs.yml index 24d6e2b..a5419ef 100644 --- a/labs.yml +++ b/labs.yml @@ -3,6 +3,7 @@ name: dqx description: Data Quality Framework for PySpark Workloads install: script: src/databricks/labs/dqx/installer/install.py + extras: cli uninstall: script: src/databricks/labs/dqx/installer/uninstall.py entrypoint: src/databricks/labs/dqx/cli.py diff --git a/pyproject.toml b/pyproject.toml index c3310da..32c449a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,9 +30,11 @@ classifiers = [ dependencies = ["databricks-labs-blueprint>=0.9.1,<0.10", "databricks-sdk~=0.30", "databricks-labs-lsql>=0.5,<0.15", - "pyspark~=3.5.0" # required by the CLI to work correctly ] +[project.optional-dependencies] +cli = ["pyspark~=3.5.0"] + [project.entry-points.databricks] runtime = "databricks.labs.dqx.runtime:main" @@ -70,6 +72,7 @@ dependencies = [ "types-PyYAML~=6.0.12", "types-requests~=2.31.0", "databricks-connect~=15.4", + "pyspark~=3.5.0", ] python="3.10" @@ -79,22 +82,22 @@ path = ".venv" [tool.hatch.envs.default.scripts] test = "pytest tests/unit/ -n 10 --cov --cov-report=xml:coverage-unit.xml --timeout 30 --durations 20" -coverage = "pytest tests/ -n 10 --cov --cov-report=html --timeout 480 --durations 20" -integration = "pytest tests/integration/ -n 10 --cov --cov-report=xml --cov-config=tests/integration/.coveragerc --timeout 480 --durations 20" -fmt = ["black . --extend-exclude 'demos/'", +coverage = "pytest tests/ -n 10 --cov --cov-report=html --timeout 600 --durations 20" +integration = "pytest tests/integration/ -n 10 --cov --cov-report=xml --timeout 600 --durations 20" +fmt = ["black .", "ruff check . --fix", - "mypy . --exclude 'demos/*'", + "mypy .", "pylint --output-format=colorized -j 0 src tests"] -verify = ["black --check . --extend-exclude 'demos/'", +verify = ["black --check .", "ruff check .", - "mypy . 
--exclude 'demos/*'", + "mypy .", "pylint --output-format=colorized -j 0 src tests"] [tool.isort] profile = "black" [tool.mypy] -exclude = ['venv', '.venv'] +exclude = ['venv', '.venv', 'demos/*'] [tool.pytest.ini_options] addopts = "--no-header" @@ -105,6 +108,7 @@ filterwarnings = ["ignore::DeprecationWarning"] target-version = ["py310"] line-length = 120 skip-string-normalization = true +extend-exclude = 'demos/' [tool.ruff] cache-dir = ".venv/ruff-cache" diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 8798900..6cb5614 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,6 +1,5 @@ import os import logging -import threading from collections.abc import Callable, Generator from functools import cached_property @@ -25,7 +24,6 @@ logging.getLogger("databricks.labs.dqx").setLevel("DEBUG") logger = logging.getLogger(__name__) -_lock = threading.Lock() @pytest.fixture diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 8d71b85..8df0f21 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,7 +1,4 @@ -import tempfile -import os import pyspark.sql.functions as F -from pyspark.sql.types import Row import pytest from databricks.labs.dqx.utils import read_input_data, get_column_name @@ -30,20 +27,6 @@ def test_get_col_name_longer(): assert actual == "local" -def test_read_input_data_storage_path(spark_local): - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - temp_file.write(b"val1,val2\n") - temp_file_path = temp_file.name - - try: - input_location = temp_file_path - result = read_input_data(spark_local, input_location, "csv") - assert result.collect() == [Row(_c0='val1', _c1='val2')] - - finally: - os.remove(temp_file_path) - - def test_read_input_data_no_input_location(spark_local): with pytest.raises(ValueError, match="Input location not configured"): read_input_data(spark_local, None, None)