Commit 37ed5d6
Merge branch 'develop'
jcampbell committed Aug 19, 2019
2 parents 178a511 + 89138b6 commit 37ed5d6
Showing 68 changed files with 2,688 additions and 653 deletions.
40 changes: 40 additions & 0 deletions docs/core_concepts/custom_expectations.rst
@@ -185,4 +185,44 @@ A similar approach works for the command-line tool.
dataset_class=custom_dataset.CustomPandasDataset
Using custom expectations with a DataSource
--------------------------------------------------------------------------------
To use custom expectations in a datasource or DataContext, you need to define the custom DataAsset in the datasource
configuration or batch_kwargs for a specific batch. Following the same example above, let's suppose you've defined
`CustomPandasDataset` in a module called `custom_dataset.py`. You can configure your datasource to return instances
of your custom DataAsset type by passing in a :ref:`ClassConfig` that describes your source.
If you are working with a DataContext, simply placing `custom_dataset.py` in your configured plugin directory will make it
accessible; otherwise, you need to ensure the module is on the import path.
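
For illustration, here is a minimal, hedged sketch of what `custom_dataset.py` might contain; the expectation logic is illustrative, and the name matches the snippet shown below:

.. code-block:: python

    # custom_dataset.py; a minimal sketch, with illustrative expectation logic
    from great_expectations.dataset import PandasDataset, MetaPandasDataset

    class CustomPandasDataset(PandasDataset):

        @MetaPandasDataset.column_map_expectation
        def expect_column_values_to_equal_1(self, column):
            # Return True for each value equal to 1; failing values are
            # reported in unexpected_list in the validation result.
            return column.map(lambda value: value == 1)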
Once you do this, all the functionality of your new expectations will be available for use. For example, you could use
the datasource snippet below to configure a PandasDatasource that will produce instances of your new
CustomPandasDataset in a DataContext.
.. code-block:: yaml

    datasources:
      my_datasource:
        type: pandas  # class_name: PandasDatasource
        data_asset_type:
          module_name: custom_dataset
          class_name: CustomPandasDataset
        generators:
          default:
            type: subdir_reader  # class_name: SubdirReaderGenerator
            base_directory: /data
            reader_options:
              sep: \t
.. code-block:: python

    >> import great_expectations as ge
    >> context = ge.DataContext()
    >> my_df = context.get_batch("my_datasource/default/my_file")
    >> my_df.expect_column_values_to_equal_1("all_twos")
    {
      "success": False,
      "unexpected_list": [2,2,2,2,2,2,2,2]
    }
4 changes: 2 additions & 2 deletions docs/core_concepts/data_context.rst
@@ -11,13 +11,13 @@ as well as managed expectation suites should be stored in version control.

DataContexts use data sources you're already familiar with. Generators help introspect data stores and data execution
frameworks (such as airflow, Nifi, dbt, or dagster) to describe and produce batches of data ready for analysis. This
enables fetching, validation, profiling, and documentation of your data in a way that is meaningful within your
existing infrastructure and work environment.

DataContexts use a datasource-based namespace, where each accessible type of data has a three-part
normalized *data_asset_name*, consisting of *datasource/generator/generator_asset*.
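
For example (a hedged snippet with hypothetical names, where ``my_datasource`` is the datasource, ``default`` the generator, and ``my_file`` the generator asset):

.. code-block:: python

    import great_expectations as ge

    context = ge.DataContext()
    # The normalized name follows datasource/generator/generator_asset
    batch = context.get_batch("my_datasource/default/my_file")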

- The datasource actually connects to a source of materialized data and returns Great Expectations DataAssets \
- The datasource actually connects to a source of data and returns Great Expectations DataAssets \
connected to a compute environment and ready for validation.

- The Generator knows how to introspect datasources and produce identifying "batch_kwargs" that define \
13 changes: 7 additions & 6 deletions docs/core_concepts/datasource.rst
@@ -3,12 +3,10 @@
Datasources
============

Datasources are responsible for connecting to data infrastructure. Each Datasource is a source
of materialized data, such as a SQL database, S3 bucket, or local file directory.

Each Datasource also provides access to Great Expectations data assets that are connected to
a specific compute environment, such as a SQL database, a Spark cluster, or a local in-memory
Pandas DataFrame.
Datasources are responsible for connecting data and compute infrastructure. Each Datasource provides
Great Expectations DataAssets (or batches in a DataContext) connected to a specific compute environment, such as a
SQL database, a Spark cluster, or a local in-memory Pandas DataFrame. Datasources know how to access data from
relevant sources, such as an existing object from a DAG runner, a SQL database, an S3 bucket, or a local filesystem.

To bridge the gap between those worlds, Datasources interact closely with *generators*, which
are aware of a source of data and can produce identifying information, called
@@ -23,6 +21,9 @@ a SqlAlchemyDataset corresponding to that batch of data and ready for validation
Opinionated DAG managers such as airflow, dbt, prefect.io, or dagster can also act as datasources
and/or generators for a more generic datasource.

When adding custom expectations by subclassing an existing DataAsset type, use the data_asset_type parameter
to configure the datasource to load and return DataAssets of the custom type.
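
As an illustration, a hedged sketch of configuring this programmatically (assuming the 0.7.7-era ``add_datasource(name, type_, **kwargs)`` signature and the `custom_dataset` module from the custom expectations example):

.. code-block:: python

    import great_expectations as ge

    context = ge.DataContext()
    # data_asset_type tells the datasource which DataAsset class to return
    context.add_datasource(
        "my_datasource",
        "pandas",
        data_asset_type={
            "module_name": "custom_dataset",
            "class_name": "CustomPandasDataset",
        },
    )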

See :ref:`batch_generator` for more detail about how batch generators interact with datasources and DAG runners.

See datasource module docs :ref:`datasource_module` for more detail about available datasources.
2 changes: 1 addition & 1 deletion docs/getting_started/cli_init.rst
@@ -120,7 +120,7 @@ Datasources allow you to configure connections to data to evaluate Expectations.
2. Relational databases via SQL Alchemy
3. Spark DataFrames

Therefore, a Datasource could be a local pandas environment with some configuration to parse CSV files from a directory; a connection to postgresql instance; a Spark cluster connected to an S3 bucket; etc. In the future, we plan to add support for other compute environments, such as dask and BigQuery. (If you'd like to use or contribute to those environments, please chime in on `GitHub issues <https://github.com/great-expectations/great_expectations/issues>`_.)
Therefore, a Datasource could be a local pandas environment with some configuration to parse CSV files from a directory; a connection to a PostgreSQL instance; a Spark cluster connected to an S3 bucket; etc. In the future, we plan to add support for other compute environments, such as dask. (If you'd like to use or contribute to those environments, please chime in on `GitHub issues <https://github.com/great-expectations/great_expectations/issues>`_.)

Our example project has a ``data/`` folder containing several CSVs. Within the CLI, we can configure a Pandas DataFrame Datasource like so:

7 changes: 5 additions & 2 deletions docs/guides/batch_generator.rst
@@ -13,8 +13,11 @@ the Events table with a timestamp on February 7, 2012," which a SqlAlchemyDataso
could use to materialize a SqlAlchemyDataset corresponding to that batch of data and
ready for validation.

A batch is a sample from a data asset, sliced according to a particular rule. For
example, an hourly slide of the Events table or “most recent `users` records.”
Batch
------

A batch is a sample from a data asset, sliced according to a particular rule.
For example, an hourly slice of the Events table or “most recent `users` records.”
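
To make "batch_kwargs" concrete, here is a hedged illustration; the key names are hypothetical, since each generator defines its own identifying keys:

.. code-block:: python

    # Identifying information a generator might emit for "an hourly slice
    # of the Events table"; key names are illustrative only.
    batch_kwargs = {
        "table": "events",
        "timestamp_column": "event_time",
        "start": "2012-02-07T00:00:00",
        "end": "2012-02-07T01:00:00",
    }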

A Batch is the primary unit of validation in the Great Expectations DataContext.
Batches include metadata that identifies how they were constructed--the same “batch_kwargs”
38 changes: 29 additions & 9 deletions docs/guides/data_documentation.rst
@@ -27,14 +27,13 @@ And then detailed statistics for each column:
.. image:: ../images/movie_db_profiling_screenshot_1.jpg


There are three use cases for using documentation in a data project:
The GE DataContext uses a configurable "data documentation site" to define which artifacts to compile and how to render them as documentation. Multiple sites can be configured inside a project, each suitable for a particular data documentation use case.

1. Visualize all Great Expectations artifacts in the local repo of my project as HTML: expectation suites, validation results and profiling results.
For example, we have identified three common use cases for documentation in a data project:

1. Visualize all Great Expectations artifacts in the local repo of a project as HTML: expectation suites, validation results and profiling results.
2. Maintain a "shared source of truth" for a team working on a data project. This documentation renders all the artifacts committed in the source control system (expectation suites and profiling results) and a continuously updating data quality report, built from a chronological list of validations by run id.

3. Share a spec of a dataset with a client or a partner. This is similar to an API documentaiton in software development. This documentation would include profiling results of the dataset to give the reader a quick way to grasp what the data looks like, and one or more expectation suites that encode what is expected from the data to be considered valid.

3. Share a spec of a dataset with a client or a partner. This is similar to API documentation in software development. This documentation would include profiling results of the dataset to give the reader a quick way to grasp what the data looks like, and one or more expectation suites that encode what is expected from the data to be considered valid.

To support these (and possibly other) use cases GE has a concept of "data documentation site". Multiple sites can be configured inside a project, each suitable for a particular data documentation use case.

@@ -52,7 +51,10 @@ Users can specify
* where the HTML files should be written (filesystem or S3)
* which renderer and view class should be used to render each section

Here is an example of a site configuration:
Data Documentation Site Configuration
*************************************

Here is an example of a site configuration from great_expectations.yml:

.. code-block:: yaml
@@ -67,7 +69,7 @@ Here is an example of a site configuration:
        type: filesystem
        base_directory: uncommitted/validations/
        run_id_filter:
          ne: profiling
          ne: profiling  # exclude validations with run id "profiling" - reserved for profiling results
      profiling_store:  # where to look for profiling results (filesystem/S3)
        type: filesystem
        base_directory: uncommitted/validations/
@@ -105,11 +107,19 @@ Here is an example of a site configuration:
          module: great_expectations.render.view
          class: DefaultJinjaPageView
* ``validations_store`` and ``profiling_store`` in the example above specify the location of validation and profiling results that the site will include in the documentation. The store's ``type`` can be ``filesystem`` or ``s3`` (the S3 store is not currently implemented, but will be supported in the near future). ``base_directory`` must be specified for ``filesystem`` stores. The optional ``run_id_filter`` attribute allows you to include (``eq`` for exact match) or exclude (``ne``) validation results with a particular run id.



Adjusting Data Documentation For Your Project's Needs
*****************************************************

By default, GE creates two data documentation sites for a new project:

1. "local_site" renders documentation for all the datasources in the project from GE artifacts in the local repo. The site includes expectation suites and profiling and validation results from `uncommitted` directory. Local site provides the convenience of visualizing all the entities stored in JSON files as HTML.
2. "team_site" is meant to support the "shared source of truth for a team" use case. By default only the expectations section is enabled. Users have to configure the profiling and the validations sections (and the corresponding validations_store and profiling_store attributes based on the team's decisions where these are stored (a local filesystem or S3). Reach out on `Slack <https://tinyurl.com/great-expectations-slack>`__ if you would like to discuss the best way to configure a team site.
1. "local_site" renders documentation for all the datasources in the project from GE artifacts in the local filesystem. The site includes expectation suites and profiling and validation results from the `uncommitted` directory. Local site provides the convenience of visualizing all the entities stored in JSON files as HTML.
2. "team_site" is meant to support the "shared source of truth for a team" use case. By default only the expectations section is enabled. Users have to configure the profiling and the validations sections (and the corresponding validations_store and profiling_store attributes) based on the team's decisions about where these are stored.) Reach out on `Slack <https://tinyurl.com/great-expectations-slack>`__ if you would like to discuss the best way to configure a team site.

Users have full control over configuring Data Documentation for their project - they can modify the two pre-configured sites (or remove them altogether) and add new sites with a configuration that meets the project's needs. The easiest way to add a new site to the configuration is to copy the "local_site" configuration block in great_expectations.yml, give the copy a new name and modify the details as needed.

How to build documentation
----------------------------
@@ -172,3 +182,13 @@ for how to profile a single batch of data and build documentation from the valid
    safe_mmkdir(os.path.dirname(profiling_html_filepath))
    with open(profiling_html_filepath, 'w') as writer:
        writer.write(DefaultJinjaPageView.render(document_model))
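
For context, a hedged reconstruction of the surrounding example (the profiler and renderer names appear elsewhere in this commit; the call style, asset name, and output path are assumptions):

.. code-block:: python

    import os

    import great_expectations as ge
    from great_expectations.data_context.util import safe_mmkdir
    from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler
    from great_expectations.render.renderer import ProfilingResultsPageRenderer
    from great_expectations.render.view import DefaultJinjaPageView

    context = ge.DataContext()
    batch = context.get_batch("my_datasource/default/my_file")  # hypothetical asset

    # Profiling returns an expectation suite plus the validation results
    # produced while profiling the batch.
    expectation_suite, validation_results = BasicDatasetProfiler.profile(batch)

    # Render the validation results into a document model, then into HTML.
    document_model = ProfilingResultsPageRenderer.render(validation_results)

    profiling_html_filepath = "uncommitted/documentation/profiling.html"  # illustrative
    safe_mmkdir(os.path.dirname(profiling_html_filepath))
    with open(profiling_html_filepath, 'w') as writer:
        writer.write(DefaultJinjaPageView.render(document_model))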
Dependencies
~~~~~~~~~~~~
* Font Awesome 5.10.1
* Bootstrap 4.3.1
* jQuery 3.2.1
* Vega 5.3.5
* Vega-Lite 3.2.1
* Vega-Embed 4.0.0
26 changes: 26 additions & 0 deletions docs/roadmap_and_changelog/changelog.rst
@@ -1,6 +1,32 @@
.. _changelog:


v.0.7.7
-----------------
* Standardize the way that plugin module loading works. DataContext will begin to use the new-style class and plugin
identification moving forward; yml configs should specify class_name and module_name (with module_name optional for
GE types). For now, it is possible to use the "type" parameter in configuration (as before).
* Add support for custom data_asset_type to all datasources
* Add support for strict_min and strict_max to inequality-based expectations to allow strict inequality checks
(thanks @RoyalTS!)
* Add support for reader_method = "delta" to SparkDFDatasource
* Fix databricks generator (thanks @sspitz3!)
* Improve performance of DataContext loading by moving optional import
* Fix several memory and performance issues in SparkDFDataset.
- Use only distinct value count instead of bringing values to driver
- Migrate away from UDF for set membership, nullity, and regex expectations
* Fix several UI issues in the data_documentation
- Move prescriptive dataset expectations to Overview section
- Fix broken link on Home breadcrumb
- Scroll follows navigation properly
- Improved flow for long items in value_set
- Improved testing for ValidationRenderer
- Clarify dependencies introduced in documentation sites
- Improve testing and documentation for site_builder, including run_id filter
- Fix missing header in Index page and cut-off tooltip
- Add run_id to path for validation files


v.0.7.6
-----------------
* New Validation Renderer! Supports turning validation results into HTML and displays differences between the expected \
24 changes: 22 additions & 2 deletions great_expectations/cli/cli.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
from .datasource import (
    add_datasource,
    add_datasource as add_datasource_impl,
    profile_datasource,
    build_documentation as build_documentation_impl,
    msg_go_to_notebook
@@ -195,7 +195,7 @@ def init(target_directory):
"\nDone.",
)

data_source_name = add_datasource(context)
data_source_name = add_datasource_impl(context)

if not data_source_name: # no datasource was created
return
@@ -204,6 +204,26 @@ def init(target_directory):

    cli_message(msg_go_to_notebook)

@cli.command()
@click.option('--directory', '-d', default="./great_expectations",
              help='The root of a project directory containing a great_expectations/ config.')
def add_datasource(directory):
    """Add a new datasource to the data context
    """
    try:
        context = DataContext(directory)
    except ConfigNotFoundError:
        cli_message("Error: no great_expectations context configuration found in the specified directory.")
        return

    data_source_name = add_datasource_impl(context)

    if not data_source_name:  # no datasource was created
        return

    profile_datasource(context, data_source_name)
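
A hedged usage sketch via Click's test runner; the import path is an assumption, and the dash-converted command name assumes Click 7's default naming (older Click versions may register it as "add_datasource"):

# Exercise the new CLI command programmatically with Click's test runner.
from click.testing import CliRunner

from great_expectations.cli.cli import cli  # import path is an assumption

runner = CliRunner()
result = runner.invoke(cli, ["add-datasource", "--directory", "./great_expectations"])
print(result.output)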


@cli.command()
@click.argument('datasource_name', default=None, required=False)
2 changes: 1 addition & 1 deletion great_expectations/cli/datasource.py
@@ -302,7 +302,7 @@ def profile_datasource(context, data_source_name, data_assets=None, profile_all_

    else:
        cli_message(
            "Okay, skipping HTML documentation for now.`."
            "Okay, skipping HTML documentation for now."
        )


26 changes: 25 additions & 1 deletion great_expectations/data_asset/data_asset.py
@@ -959,7 +959,31 @@ def validate(self,
warnings.warn(
"WARNING: No great_expectations version found in configuration object.")

        for expectation in expectation_suite['expectations']:


        ###
        # This is an early example of what will become part of the ValidationOperator
        # This operator would be dataset-semantic aware
        # Adding now to simply ensure we can be slightly better at ordering our expectation evaluation
        ###

        # Group expectations by column
        columns = {}

        for expectation in expectation_suite["expectations"]:
            if "column" in expectation["kwargs"]:
                column = expectation["kwargs"]["column"]
            else:
                column = "_nocolumn"
            if column not in columns:
                columns[column] = []
            columns[column].append(expectation)

        expectations_to_evaluate = []
        for col in columns:
            expectations_to_evaluate.extend(columns[col])

        for expectation in expectations_to_evaluate:

            try:
                expectation_method = getattr(
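
For clarity, the column-grouping step above as a standalone, runnable sketch (the toy expectation suite is illustrative):

# Bucket expectations by their "column" kwarg so column-wise checks run
# together; table-level expectations fall under "_nocolumn".
expectation_suite = {
    "expectations": [
        {"expectation_type": "expect_column_values_to_not_be_null", "kwargs": {"column": "a"}},
        {"expectation_type": "expect_table_row_count_to_be_between", "kwargs": {"min_value": 1}},
        {"expectation_type": "expect_column_values_to_be_unique", "kwargs": {"column": "a"}},
    ]
}

columns = {}
for expectation in expectation_suite["expectations"]:
    column = expectation["kwargs"].get("column", "_nocolumn")
    columns.setdefault(column, []).append(expectation)

expectations_to_evaluate = []
for col in columns:
    expectations_to_evaluate.extend(columns[col])

# Both expectations on column "a" are now adjacent in evaluation order.
print([e["expectation_type"] for e in expectations_to_evaluate])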
16 changes: 1 addition & 15 deletions great_expectations/data_context/data_context.py
@@ -11,12 +11,10 @@
from six import string_types
import datetime
import shutil
from collections import OrderedDict

from .util import NormalizedDataAssetName, get_slack_callback, safe_mmkdir

from great_expectations.exceptions import DataContextError, ConfigNotFoundError, ProfilerError
from great_expectations.render.types import RenderedDocumentContent

from great_expectations.render.renderer.site_builder import SiteBuilder

@@ -34,19 +32,6 @@
    DBTDatasource
)
from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler
from great_expectations.render.renderer import ProfilingResultsPageRenderer, ExpectationSuitePageRenderer, ValidationResultsPageRenderer
from great_expectations.render.view import (
    DefaultJinjaPageView,
    DefaultJinjaIndexPageView,
)
from great_expectations.render.types import (
    RenderedComponentContent,
    RenderedSectionContent,
)



from .expectation_explorer import ExpectationExplorer

logger = logging.getLogger(__name__)
yaml = YAML()
@@ -142,6 +127,7 @@ def __init__(self, context_root_dir=None, expectation_explorer=False, data_asset
        self._expectation_explorer = expectation_explorer
        self._datasources = {}
        if expectation_explorer:
            from great_expectations.jupyter_ux.expectation_explorer import ExpectationExplorer
            self._expectation_explorer_manager = ExpectationExplorer()

# determine the "context root directory" - this is the parent of "great_expectations" dir
6 changes: 6 additions & 0 deletions great_expectations/data_context/util.py
@@ -6,6 +6,7 @@
import json
import errno
from collections import namedtuple
import six

logger = logging.getLogger(__name__)

@@ -121,6 +122,11 @@ def send_slack_notification(validation_json=None):

def safe_mmkdir(directory, exist_ok=True):
"""Simple wrapper since exist_ok is not available in python 2"""
    if not isinstance(directory, six.string_types):
        raise TypeError("directory must be of type str, not {0}".format({
            "directory_type": str(type(directory))
        }))

    if not exist_ok:
        raise ValueError(
            "This wrapper should only be used for exist_ok=True; it is designed to make porting easier later")