diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d70a5b69c..52a42dd72 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,7 @@ repos: additional_dependencies: [ pyupgrade==2.7.3 ] args: [ --nbqa-mutate, --py36-plus ] - repo: https://github.com/asottile/pyupgrade - rev: v2.9.0 + rev: v2.10.0 hooks: - id: pyupgrade args: ['--py36-plus','--exit-zero-even-if-changed'] diff --git a/README.md b/README.md index f1b9a9133..81878e8e3 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,7 @@ For each column the following statistics - if relevant for the column type - are ## Announcements -**Version v2.10.1 released**: containing stability fixes for the previous release, which included a major overhaul of the type system, now fully reliant on visions. -See the changelog below to know what has changed. +**Version v2.11.0 released**, featuring an exciting integration with Great Expectations that many of you requested (see details below). **Spark backend in progress**: We can happily announce that we're nearing v1 for the Spark backend for generating profile reports. Stay tuned. @@ -52,18 +51,18 @@ It's extra exciting that GitHub **matches your contribution** for the first year Find more information here: - - [Changelog v2.10.1](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/changelog.html#changelog-v2-10-1) + - [Changelog v2.11.0](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/changelog.html#changelog-v2-11-0) - [Sponsor the project on GitHub](https://github.com/sponsors/sbrugman) -_February 7, 2021 πŸ’˜_ +_February 20, 2021 πŸ’˜_ --- _Contents:_ **[Examples](#examples)** | **[Installation](#installation)** | **[Documentation](#documentation)** | **[Large datasets](#large-datasets)** | **[Command line usage](#command-line-usage)** | -**[Advanced usage](#advanced-usage)** | **[Support](#supporting-open-source)** | -**[Types](#types)** | **[How to contribute](#contributing)** | +**[Advanced usage](#advanced-usage)** | **[Integrations](#integrations)** | +**[Support](#supporting-open-source)** | **[Types](#types)** | **[How to contribute](#contributing)** | **[Editor Integration](#editor-integration)** | **[Dependencies](#dependencies)** --- @@ -238,16 +237,43 @@ A set of options is available in order to adapt the report generated. * `title` (`str`): Title for the report ('Pandas Profiling Report' by default). * `pool_size` (`int`): Number of workers in thread pool. When set to zero, it is set to the number of CPUs available (0 by default). * `progress_bar` (`bool`): If True, `pandas-profiling` will display a progress bar. +* `infer_dtypes` (`bool`): When `True` (default), the `dtype` of variables is inferred using the `visions` typeset logic (for instance, a column with integers stored as strings will be analyzed as numeric). More settings can be found in the [default configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_default.yaml), [minimal configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_minimal.yaml) and [dark themed configuration file](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_dark.yaml).
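As a quick illustration of the new `infer_dtypes` option described above, here is a minimal sketch (assuming the option can be passed as a keyword argument like the other top-level settings; the sample column is made up):

```python
import pandas as pd
from pandas_profiling import ProfileReport

df = pd.DataFrame({"amount": ["1", "2", "3"]})  # integers stored as strings

# With inference enabled (the default), "amount" is analyzed as a numeric column
report_inferred = ProfileReport(df, title="Inferred dtypes", infer_dtypes=True)

# With inference disabled, the column keeps the dtype pandas detected (object/categorical)
report_detected = ProfileReport(df, title="Detected dtypes", infer_dtypes=False)
```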
-**Example** +You can find the configuration docs on the advanced usage page [here](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/advanced_usage.html). +**Example** ```python profile = df.profile_report(title='Pandas Profiling Report', plot={'histogram': {'bins': 8}}) profile.to_file("output.html") ``` +## Integrations + +### Great Expectations + + + + + + +
+ +Great Expectations + + + +Profiling your data is closely related to data validation: often validation rules are defined in terms of well-known statistics. +For that purpose, `pandas-profiling` integrates with [Great Expectations](https://www.greatexpectations.io). +This is a world-class open-source library that helps you to maintain data quality and improve communication about data between teams. +Great Expectations allows you to create Expectations (which are basically unit tests for your data) and Data Docs (conveniently shareable HTML data reports). +`pandas-profiling` features a method to create a suite of Expectations based on the results of your ProfileReport, which you can store and use to validate another (or future) dataset. + +You can find more details on the Great Expectations integration [here](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/great_expectations_integration.html). + +
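For a quick impression, a minimal sketch of the integration (the Titanic CSV is just an example dataset; the documentation page linked above covers the full workflow, including saving the suite and building Data Docs):

```python
import pandas as pd
from pandas_profiling import ProfileReport

df = pd.read_csv("titanic.csv")
profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)

# Derive an Expectation Suite from the profiling results
suite = profile.to_expectation_suite(suite_name="titanic_expectations")
```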
+ + ## Supporting open source Maintaining and developing the open-source code for pandas-profiling, with millions of downloads and thousands of users, would not be possible without support of our gracious sponsors. @@ -269,7 +295,7 @@ Maintaining and developing the open-source code for pandas-profiling, with milli We would like to thank our generous Github Sponsors supporters who make pandas-profiling possible: - Martin Sotir, Joseph Yuen, Brian Lee, Stephanie Rivera, nscsekhar, abdulAziz + Martin Sotir, Brian Lee, Stephanie Rivera, abdulAziz, gramster More info if you would like to appear here: [Github Sponsor page](https://github.com/sponsors/sbrugman) @@ -277,7 +303,7 @@ More info if you would like to appear here: [Github Sponsor page](https://github ## Types Types are a powerful abstraction for effective data analysis, that goes beyond the logical data types (integer, float etc.). -`pandas-profiling` currently recognizes the following types: _Boolean, Numerical, Date, Categorical, URL, Path, File_ and _Image_. +`pandas-profiling` currently recognizes the following types: _Boolean, Numerical, Date, Categorical, URL, Path, File_ and _Image_. We have developed a type system for Python, tailored for data analysis: [visions](https://github.com/dylan-profiler/visions). Selecting the right typeset drastically reduces the complexity the code of your analysis. diff --git a/docsrc/source/index.rst b/docsrc/source/index.rst index 9272e8490..f54a96d2f 100644 --- a/docsrc/source/index.rst +++ b/docsrc/source/index.rst @@ -15,6 +15,7 @@ pages/sensitive_data pages/metadata pages/integrations + pages/great_expectations_integration pages/changelog .. toctree:: diff --git a/docsrc/source/pages/changelog.rst b/docsrc/source/pages/changelog.rst index 08731a85e..0eca44ba5 100644 --- a/docsrc/source/pages/changelog.rst +++ b/docsrc/source/pages/changelog.rst @@ -2,6 +2,8 @@ Changelog ========= +.. include:: changelog/v2_11_0.rst + .. include:: changelog/v2_10_1.rst .. include:: changelog/v2_10_0.rst diff --git a/docsrc/source/pages/changelog/v2_11_0.rst b/docsrc/source/pages/changelog/v2_11_0.rst new file mode 100644 index 000000000..d4b7aa3e2 --- /dev/null +++ b/docsrc/source/pages/changelog/v2_11_0.rst @@ -0,0 +1,16 @@ +Changelog v2.11.0 +----------------- + +πŸŽ‰ Features +^^^^^^^^^^^ +- Great Expectations integration `[430] `_ `docs `_ (thanks @spbail, @talagluck and the Great Expectations team). +- Introduced the ``infer_dtypes`` parameter to control automatic inference of data types `[676] `_ (thanks @mohith7548 and @ieaves). +- Improved JSON representation for pd.Series, pd.DataFrame, numpy data and Samples. + +🚨 Breaking changes +^^^^^^^^^^^^^^^^^^^ +- Global config setting removed; config resets on report initialization. + +⬆️ Dependencies +^^^^^^^^^^^^^^^^^^ +- Update ``pyupgrade`` to ``2.10.0``.
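To illustrate the breaking change above, a hedged sketch of the per-report configuration pattern used by the updated tests in this release (the threshold value is arbitrary):

.. code-block:: python

    import pandas as pd
    from pandas_profiling import ProfileReport

    df = pd.DataFrame({"A": [1, 2, 3, 3]})

    # The global config is reset whenever a report is initialized,
    # so settings are applied per report instead of mutating the module-level config.
    report = ProfileReport(df, correlations=None)
    report.set_variable("vars.num.low_categorical_threshold", 2)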
diff --git a/docsrc/source/pages/changelog/v2_12_0.rst b/docsrc/source/pages/changelog/v2_12_0.rst new file mode 100644 index 000000000..6b287f91b --- /dev/null +++ b/docsrc/source/pages/changelog/v2_12_0.rst @@ -0,0 +1,30 @@ +Changelog v2.12.0 +---------------- + +πŸŽ‰ Features +^^^^^^^^^^^ +- + +πŸ› Bug fixes +^^^^^^^^^^^^ +- + +πŸ‘·β€β™‚οΈ Internal Improvements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- + +πŸ“– Documentation +^^^^^^^^^^^^^^^^ +- + +⚠️ Deprecated +^^^^^^^^^^^^^^^^^ +- + +🚨 Breaking changes +^^^^^^^^^^^^^^^^^^^ +- + +⬆️ Dependencies +^^^^^^^^^^^^^^^^^^ +- \ No newline at end of file diff --git a/docsrc/source/pages/great_expectations_integration.rst b/docsrc/source/pages/great_expectations_integration.rst new file mode 100644 index 000000000..e21e3bf61 --- /dev/null +++ b/docsrc/source/pages/great_expectations_integration.rst @@ -0,0 +1,150 @@ +==================================== +Integration with Great Expectations +==================================== + +`Great Expectations `_ is a Python-based open-source library for validating, documenting, and profiling your data. It helps you to maintain data quality and improve communication about data between teams. With Great Expectations, you can assert what you expect from the data you load and transform, and catch data issues quickly – Expectations are basically *unit tests for your data*. Pandas Profiling features a method to create a suite of Expectations based on the results of your ProfileReport! + + +About Great Expectations +------------------------- + +*Expectations* are assertions about your data. In Great Expectations, those assertions are expressed in a declarative language in the form of simple, human-readable Python methods. For example, in order to assert that you want values in a column ``passenger_count`` in your dataset to be integers between 1 and 6, you can say: + + ``expect_column_values_to_be_between(column="passenger_count", min_value=1, max_value=6)`` + +Great Expectations then uses this statement to validate whether the column ``passenger_count`` in a given table is indeed between 1 and 6, and returns a success or failure result. The library currently provides :ref:`several dozen highly expressive built-in Expectations`, and allows you to write custom Expectations. + +Great Expectations renders Expectations to clean, human-readable documentation called *Data Docs*. These HTML docs contain both your Expectation Suites as well as your data validation results each time validation is run – think of it as a continuously updated data quality report. + +For more information about Great Expectations, check out the `Great Expectations documentation `_ and join the `Great Expectations Slack channel ` for help. + + +Creating Expectation Suites with Pandas Profiling +-------------------------------------------------- + +An *Expectation Suite* is simply a set of Expectations. You can create Expectation Suites by writing out individual statements, such as the one above, or by automatically generating them based on profiler results. + +Pandas Profiling provides a simple ``to_expectation_suite()`` method that returns a Great Expectations ``ExpectationSuite`` object which contains a set of Expectations. 
+ +**Pre-requisites**: In order to run the ``to_expectation_suite()`` method, you will need to install Great Expectations: +`pip install great_expectations` + +If you would like to use the additional features such as saving the Suite and building Data Docs, you will also need to configure a Great Expectations Data Context by running ``great_expectations init`` while in your project directory. + +.. code-block:: python + + import pandas as pd + from pandas_profiling import ProfileReport + + df = pd.read_csv("titanic.csv") + + profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True) + + # Obtain an Expectation Suite with a set of default Expectations + # By default, this also profiles the dataset, saves the suite, runs validation, and builds Data Docs + suite = profile.to_expectation_suite() + + +This assumes that the ``great_expectations`` Data Context directory is in the *same path* where you run the script. In order to specify the location of your Data Context, pass it in as an argument: + +.. code-block:: python + + import great_expectations as ge + + data_context = ge.data_context.DataContext(context_root_dir="/Users/panda/code/my_ge_project/") + suite = profile.to_expectation_suite(data_context=data_context) + + +You can also configure each feature individually in the function call: + +.. code-block:: python + + suite = profile.to_expectation_suite( + suite_name="titanic_expectations", + data_context=data_context, + save_suite=False, + run_validation=False, + build_data_docs=False, + handler=handler + ) + +See `the Great Expectations Examples `_ for complete examples. + + +Included Expectation types +-------------------------- + +The ``to_expectation_suite`` method returns a default set of Expectations if Pandas Profiling determines that the assertion holds true for the profiled dataset. +The Expectation types depend on the datatype of a column: + +**All columns** + +* ``expect_column_values_to_not_be_null`` +* ``expect_column_values_to_be_unique`` + +**Numeric columns** + +* ``expect_column_values_to_be_in_type_list`` +* ``expect_column_values_to_be_increasing`` +* ``expect_column_values_to_be_decreasing`` +* ``expect_column_values_to_be_between`` + +**Categorical columns** + +* ``expect_column_values_to_be_in_set`` + +**Datetime columns** + +* ``expect_column_values_to_be_between`` + +**Filename columns** + +* ``expect_file_to_exist`` + + +The default logic is straight forward and can be found here in `expectation_algorithms.py `_. + +Rolling your own Expectation Generation Logic +--------------------------------------------- + +If you would like to profile datasets at scale, your use case might require changing the default expectations logic. +The ``to_expectation_suite`` takes the ``handler`` parameter, which allows you to take full control of the generation process. +Generating expectations takes place in two steps: + +- mapping the detected type of each column to a generator function (that receives the columns' summary statistics); +- generating expectations based on the summary (e.g. ``expect_column_values_to_not_be_null`` if ``summary["n_missing"] == 0``) + +Adding an expectation to columns with constant length can be achieved for instance using this code: + +.. 
code-block:: python + + def fixed_length(name, summary, batch, *args): + """Add a length expectation to columns with constant length values""" + if summary["min_length"] == summary["max_length"]: + batch.expect_column_value_lengths_to_equal(summary["min_length"]) + return name, summary, batch + + + class MyExpectationHandler(Handler): + def __init__(self, typeset, *args, **kwargs): + mapping = { + Unsupported: [expectation_algorithms.generic_expectations], + Categorical: [expectation_algorithms.categorical_expectations, fixed_length], + Boolean: [expectation_algorithms.categorical_expectations], + Numeric: [expectation_algorithms.numeric_expectations], + URL: [expectation_algorithms.url_expectations], + File: [expectation_algorithms.file_expectations], + Path: [expectation_algorithms.path_expectations], + DateTime: [expectation_algorithms.datetime_expectations], + Image: [expectation_algorithms.image_expectations], + } + super().__init__(mapping, typeset, *args, **kwargs) + + # (initiate report) + + suite = report.to_expectation_suite( + handler=MyExpectationHandler(report.typeset) + ) + +You can automate even more by extending the typeset (by default the ``ProfilingTypeSet``) with semantic data types specific to your company or use case (for instance disease classification in healthcare or currency and IBAN in finance). +For that, you can find details in the `visions `_ documentation. diff --git a/examples/features/great_expectations_example.py b/examples/features/great_expectations_example.py new file mode 100644 index 000000000..4cda2c9d3 --- /dev/null +++ b/examples/features/great_expectations_example.py @@ -0,0 +1,68 @@ +import great_expectations as ge +import pandas as pd + +from pandas_profiling import ProfileReport +from pandas_profiling.utils.cache import cache_file + +file_name = cache_file( + "titanic.csv", + "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv", +) + +df = pd.read_csv(file_name) + +profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True) + +# Example 1 +# Obtain expectation suite, this includes profiling the dataset, saving the expectation suite, validating the +# dataframe, and building data docs +suite = profile.to_expectation_suite(suite_name="titanic_expectations") + +# Example 2 +# Run Great Expectations while specifying the directory with an existing Great Expectations set-up by passing in a +# Data Context +data_context = ge.data_context.DataContext(context_root_dir="my_ge_root_directory/") + +suite = profile.to_expectation_suite( + suite_name="titanic_expectations", data_context=data_context +) + +# Example 3 +# Just build the suite +suite = profile.to_expectation_suite( + suite_name="titanic_expectations", + save_suite=False, + run_validation=False, + build_data_docs=False, +) + +# Example 4 +# If you would like to use the method to just build the suite, and then manually save the suite, validate the dataframe, +# and build data docs + +# First instantiate a data_context +data_context = ge.data_context.DataContext(context_root_dir="my_ge_root_directory/") + +# Create the suite +suite = profile.to_expectation_suite( + suite_name="titanic_expectations", + data_context=data_context, + save_suite=False, + run_validation=False, + build_data_docs=False, +) + +# Save the suite +data_context.save_expectation_suite(suite) + +# Run validation on your dataframe +batch = ge.dataset.PandasDataset(df, expectation_suite=suite) + +results = data_context.run_validation_operator( + "action_list_operator", 
assets_to_validate=[batch] +) +validation_result_identifier = results.list_validation_result_identifiers()[0] + +# Build and open data docs +data_context.build_data_docs() +data_context.open_data_docs(validation_result_identifier) diff --git a/setup.py b/setup.py index 5d384a808..3a7f8f4f9 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ with (source_root / "requirements.txt").open(encoding="utf8") as f: requirements = f.readlines() -version = "2.10.1" +version = "2.11.0" with (source_root / "src" / "pandas_profiling" / "version.py").open( "w", encoding="utf-8" diff --git a/src/pandas_profiling/config_default.yaml b/src/pandas_profiling/config_default.yaml index bf1831a7f..42c1b1368 100644 --- a/src/pandas_profiling/config_default.yaml +++ b/src/pandas_profiling/config_default.yaml @@ -13,6 +13,9 @@ dataset: variables: descriptions: {} +# infer dtypes +infer_dtypes: True + # Show the description at each variable (in addition to the overview tab) show_variable_description: True diff --git a/src/pandas_profiling/config_minimal.yaml b/src/pandas_profiling/config_minimal.yaml index c8209852a..e1aacad3d 100644 --- a/src/pandas_profiling/config_minimal.yaml +++ b/src/pandas_profiling/config_minimal.yaml @@ -13,6 +13,9 @@ dataset: variables: descriptions: {} +# infer dtypes +infer_dtypes: True + # Show the description at each variable (in addition to the overview tab) show_variable_description: True diff --git a/src/pandas_profiling/controller/console.py b/src/pandas_profiling/controller/console.py index 22809a606..702b51fc7 100644 --- a/src/pandas_profiling/controller/console.py +++ b/src/pandas_profiling/controller/console.py @@ -60,6 +60,20 @@ def parse_args(args: Union[list, None] = None) -> argparse.Namespace: help="Title for the report", ) + parser.add_argument( + "--infer_dtypes", + default=False, + action="store_true", + help="To infer dtypes of the dataframe", + ) + + parser.add_argument( + "--no-infer_dtypes", + dest="infer_dtypes", + action="store_false", + help="To read dtypes as read by pandas", + ) + parser.add_argument( "--config_file", type=str, diff --git a/src/pandas_profiling/expectations_report.py b/src/pandas_profiling/expectations_report.py new file mode 100644 index 000000000..f1197147c --- /dev/null +++ b/src/pandas_profiling/expectations_report.py @@ -0,0 +1,115 @@ +from pandas_profiling.model import expectation_algorithms +from pandas_profiling.model.handler import Handler +from pandas_profiling.model.typeset import ( + URL, + Boolean, + Categorical, + DateTime, + File, + Image, + Numeric, + Path, + Unsupported, +) +from pandas_profiling.utils.dataframe import slugify + + +# Default handler +class ExpectationHandler(Handler): + def __init__(self, typeset, *args, **kwargs): + mapping = { + Unsupported: [expectation_algorithms.generic_expectations], + Categorical: [expectation_algorithms.categorical_expectations], + Boolean: [expectation_algorithms.categorical_expectations], + Numeric: [expectation_algorithms.numeric_expectations], + URL: [expectation_algorithms.url_expectations], + File: [expectation_algorithms.file_expectations], + Path: [expectation_algorithms.path_expectations], + DateTime: [expectation_algorithms.datetime_expectations], + Image: [expectation_algorithms.image_expectations], + } + super().__init__(mapping, typeset, *args, **kwargs) + + +class ExpectationsReport: + def to_expectation_suite( + self, + suite_name=None, + data_context=None, + save_suite=True, + run_validation=True, + build_data_docs=True, + handler=None, + ): + """ + All parameters 
default to True to make it easier to access the full functionality of Great Expectations out of + the box. + Args: + suite_name: The name of your expectation suite + data_context: A user-specified data context + save_suite: Boolean to determine whether to save the suite to .json as part of the method + run_validation: Boolean to determine whether to run validation as part of the method + build_data_docs: Boolean to determine whether to build data docs, save the .html file, and open data docs in + your browser + handler: The handler to use for building expectation + + Returns: + An ExpectationSuite + """ + try: + import great_expectations as ge + except ImportError: + raise ImportError( + "Please install great expectations before using the expectation functionality" + ) + + # Use report title if suite is empty + if suite_name is None: + suite_name = slugify(self.title) + + # Use the default handler if none + if handler is None: + handler = ExpectationHandler(self.typeset) + + # Obtain the ge context and create the expectation suite + if not data_context: + data_context = ge.data_context.DataContext() + + suite = data_context.create_expectation_suite( + suite_name, overwrite_existing=True + ) + + # Instantiate an in-memory pandas dataset + batch = ge.dataset.PandasDataset(self.df, expectation_suite=suite) + + # Obtain the profiling summary + summary = self.get_description() + + # Dispatch to expectations per semantic variable type + for name, variable_summary in summary["variables"].items(): + handler.handle(variable_summary["type"], name, variable_summary, batch) + + # We don't actually update the suite object on the batch in place, so need + # to get the populated suite from the batch + suite = batch.get_expectation_suite() + + validation_result_identifier = None + if run_validation: + batch = ge.dataset.PandasDataset(self.df, expectation_suite=suite) + + results = data_context.run_validation_operator( + "action_list_operator", assets_to_validate=[batch] + ) + validation_result_identifier = results.list_validation_result_identifiers()[ + 0 + ] + + # Write expectations and open data docs + if save_suite or build_data_docs: + data_context.save_expectation_suite(suite) + + if build_data_docs: + data_context.build_data_docs() + data_context.open_data_docs(validation_result_identifier) + + return batch.get_expectation_suite() diff --git a/src/pandas_profiling/model/expectation_algorithms.py b/src/pandas_profiling/model/expectation_algorithms.py new file mode 100644 index 000000000..83e748a36 --- /dev/null +++ b/src/pandas_profiling/model/expectation_algorithms.py @@ -0,0 +1,90 @@ +def generic_expectations(name, summary, batch, *args): + batch.expect_column_to_exist(name) + + if summary["n_missing"] == 0: + batch.expect_column_values_to_not_be_null(name) + + if summary["p_unique"] == 1.0: + batch.expect_column_values_to_be_unique(name) + + return name, summary, batch + + +def numeric_expectations(name, summary, batch, *args): + from great_expectations.profile.base import ProfilerTypeMapping + + numeric_type_names = ( + ProfilerTypeMapping.INT_TYPE_NAMES + ProfilerTypeMapping.FLOAT_TYPE_NAMES + ) + + batch.expect_column_values_to_be_in_type_list( + name, + numeric_type_names, + meta={ + "notes": { + "format": "markdown", + "content": [ + "The column values should be stored in one of these types." 
+ ], + } + }, + ) + + if summary["monotonic_increase"]: + batch.expect_column_values_to_be_increasing( + name, strictly=summary["monotonic_increase_strict"] + ) + + if summary["monotonic_decrease"]: + batch.expect_column_values_to_be_decreasing( + name, strictly=summary["monotonic_decrease_strict"] + ) + + if any(k in summary for k in ["min", "max"]): + batch.expect_column_values_to_be_between( + name, min_value=summary.get("min"), max_value=summary.get("max") + ) + + return name, summary, batch + + +def categorical_expectations(name, summary, batch, *args): + # Use for both categorical and special case (boolean) + absolute_threshold = 10 + relative_threshold = 0.2 + if ( + summary["n_distinct"] < absolute_threshold + or summary["p_distinct"] < relative_threshold + ): + batch.expect_column_values_to_be_in_set( + name, set(summary["value_counts_without_nan"].keys()) + ) + return name, summary, batch + + +def path_expectations(name, summary, batch, *args): + return name, summary, batch + + +def datetime_expectations(name, summary, batch, *args): + if any(k in summary for k in ["min", "max"]): + batch.expect_column_values_to_be_between( + name, min_value=summary.get("min"), max_value=summary.get("max") + ) + + return name, summary, batch + + +def image_expectations(name, summary, batch, *args): + return name, summary, batch + + +def url_expectations(name, summary, batch, *args): + return name, summary, batch + + +def file_expectations(name, summary, batch, *args): + # By definition within our type logic, a file exists (as it's a path that also exists) + batch.expect_file_to_exist(name) + + return name, summary, batch diff --git a/src/pandas_profiling/model/handler.py b/src/pandas_profiling/model/handler.py index 8587ae001..7fae0ca5a 100644 --- a/src/pandas_profiling/model/handler.py +++ b/src/pandas_profiling/model/handler.py @@ -1,69 +1,69 @@ -from functools import reduce -from typing import Type - -import networkx as nx -from visions import VisionsBaseType - -from pandas_profiling.model import typeset as ppt - - -def compose(functions): - """ - Compose a sequence of functions - :param functions: sequence of functions - :return: combined functions, e.g. 
[f(x), g(x)] -> g(f(x)) - """ - - def func(f, g): - def func2(*x): - res = g(*x) - if type(res) == bool: - return False - else: - return f(*res) - - return func2 - - return reduce(func, reversed(functions), lambda *x: x) - - -class Handler: - def __init__(self, mapping, typeset, *args, **kwargs): - self.mapping = mapping - self.typeset = typeset - - self._complete_dag() - - def _complete_dag(self): - for from_type, to_type in nx.topological_sort( - nx.line_graph(self.typeset.base_graph) - ): - self.mapping[to_type] = self.mapping[from_type] + self.mapping[to_type] - - def handle(self, dtype: Type[VisionsBaseType], *args, **kwargs) -> dict: - """ - - Returns: - object: - """ - op = compose(self.mapping.get(dtype, [])) - return op(*args) - - -def get_render_map(): - import pandas_profiling.report.structure.variables as render_algorithms - - render_map = { - ppt.Boolean: render_algorithms.render_boolean, - ppt.Numeric: render_algorithms.render_real, - ppt.Complex: render_algorithms.render_complex, - ppt.DateTime: render_algorithms.render_date, - ppt.Categorical: render_algorithms.render_categorical, - ppt.URL: render_algorithms.render_url, - ppt.Path: render_algorithms.render_path, - ppt.File: render_algorithms.render_file, - ppt.Image: render_algorithms.render_image, - ppt.Unsupported: render_algorithms.render_generic, - } - - return render_map +from functools import reduce +from typing import Type + +import networkx as nx +from visions import VisionsBaseType + +from pandas_profiling.model import typeset as ppt + + +def compose(functions): + """ + Compose a sequence of functions + :param functions: sequence of functions + :return: combined functions, e.g. [f(x), g(x)] -> g(f(x)) + """ + + def func(f, g): + def func2(*x): + res = g(*x) + if type(res) == bool: + return f(*x) + else: + return f(*res) + + return func2 + + return reduce(func, reversed(functions), lambda *x: x) + + +class Handler: + def __init__(self, mapping, typeset, *args, **kwargs): + self.mapping = mapping + self.typeset = typeset + + self._complete_dag() + + def _complete_dag(self): + for from_type, to_type in nx.topological_sort( + nx.line_graph(self.typeset.base_graph) + ): + self.mapping[to_type] = self.mapping[from_type] + self.mapping[to_type] + + def handle(self, dtype: Type[VisionsBaseType], *args, **kwargs) -> dict: + """ + + Returns: + object: + """ + op = compose(self.mapping.get(dtype, [])) + return op(*args) + + +def get_render_map(): + import pandas_profiling.report.structure.variables as render_algorithms + + render_map = { + ppt.Boolean: render_algorithms.render_boolean, + ppt.Numeric: render_algorithms.render_real, + ppt.Complex: render_algorithms.render_complex, + ppt.DateTime: render_algorithms.render_date, + ppt.Categorical: render_algorithms.render_categorical, + ppt.URL: render_algorithms.render_url, + ppt.Path: render_algorithms.render_path, + ppt.File: render_algorithms.render_file, + ppt.Image: render_algorithms.render_image, + ppt.Unsupported: render_algorithms.render_generic, + } + + return render_map diff --git a/src/pandas_profiling/model/summary.py b/src/pandas_profiling/model/summary.py index decd1e1e2..a579275c1 100644 --- a/src/pandas_profiling/model/summary.py +++ b/src/pandas_profiling/model/summary.py @@ -39,9 +39,15 @@ def describe_1d(series: pd.Series, summarizer: BaseSummarizer, typeset) -> dict: # Make sure pd.NA is not in the series series = series.fillna(np.nan) - # Infer variable types - vtype = typeset.infer_type(series) - series = typeset.cast_to_inferred(series) + # get 
`infer_dtypes` (bool) from config + infer_dtypes = config["infer_dtypes"].get(bool) + if infer_dtypes: + # Infer variable types + vtype = typeset.infer_type(series) + series = typeset.cast_to_inferred(series) + else: + # Detect variable types from pandas dataframe (df.dtypes). [new dtypes, changed using `astype` function are now considered] + vtype = typeset.detect_type(series) return summarizer.summarize(series, dtype=vtype) diff --git a/src/pandas_profiling/profile_report.py b/src/pandas_profiling/profile_report.py index 08984efb4..fd15da139 100644 --- a/src/pandas_profiling/profile_report.py +++ b/src/pandas_profiling/profile_report.py @@ -3,13 +3,16 @@ from pathlib import Path from typing import Any, Optional, Union +import attr import numpy as np import pandas as pd from tqdm.auto import tqdm from pandas_profiling.config import config +from pandas_profiling.expectations_report import ExpectationsReport from pandas_profiling.model.describe import describe as describe_df from pandas_profiling.model.messages import MessageType +from pandas_profiling.model.sample import Sample from pandas_profiling.model.summarizer import PandasProfilingSummarizer, format_summary from pandas_profiling.model.typeset import ProfilingTypeSet from pandas_profiling.report import get_report_structure @@ -21,7 +24,7 @@ from pandas_profiling.utils.paths import get_config -class ProfileReport(SerializeReport): +class ProfileReport(SerializeReport, ExpectationsReport): """Generate a profile report from a Dataset stored as a pandas `DataFrame`. Used has is it will output its content as an HTML report in a Jupyter notebook. @@ -50,6 +53,7 @@ def __init__( sample: optional dict(name="Sample title", caption="Caption", data=pd.DataFrame()) **kwargs: other arguments, for valid arguments, check the default configuration file. """ + config.clear() # to reset (previous) config. if config_file is not None and minimal: raise ValueError( "Arguments `config_file` and `minimal` are mutually exclusive." @@ -351,9 +355,13 @@ def encode_it(o): elif isinstance(o, set): return {encode_it(v) for v in o} elif isinstance(o, (pd.DataFrame, pd.Series)): - return o.to_json() + return encode_it(o.to_dict(orient="records")) elif isinstance(o, np.ndarray): return encode_it(o.tolist()) + elif isinstance(o, Sample): + return encode_it(attr.asdict(o)) + elif isinstance(o, np.generic): + return o.item() else: return str(o) diff --git a/src/pandas_profiling/serialize_report.py b/src/pandas_profiling/serialize_report.py index 09acbc4fa..a959e8704 100644 --- a/src/pandas_profiling/serialize_report.py +++ b/src/pandas_profiling/serialize_report.py @@ -72,11 +72,9 @@ def loads(self, data: bytes, ignore_config: bool = False): raise ValueError( f"Failed to load data: file may be damaged or from an incompatible version" ) - if (df_hash == self.df_hash) and ( - ignore_config - or config == loaded_config - or (config.is_default and self.df is None) # load to an empty ProfileReport - ): + if (df_hash == self.df_hash) or ( + config.is_default and self.df is None + ): # load to an empty ProfileReport # Set description_set, report, sample if they are None,or raise an warning. if self._description_set is None: self._description_set = loaded_description_set @@ -91,9 +89,8 @@ def loads(self, data: bytes, ignore_config: bool = False): f"The report of current ProfileReport is not None. It won't be loaded." 
) - # overwrite config if ignore_config set to True - if ignore_config: - config.update(loaded_config) + # overwrite config + config.update(loaded_config) # warn if version not equal if ( @@ -111,10 +108,7 @@ def loads(self, data: bytes, ignore_config: bool = False): self._title = loaded_title else: - raise ValueError( - "DataFrame or Config do not match with the current ProfileReport. " - 'If you want to overwrite the current configuration, use "ignore_config=True"' - ) + raise ValueError("DataFrame does not match with the current ProfileReport.") return self def dump(self, output_file: Union[Path, str]): diff --git a/src/pandas_profiling/version.py b/src/pandas_profiling/version.py index 1573d2277..1ac806f2d 100644 --- a/src/pandas_profiling/version.py +++ b/src/pandas_profiling/version.py @@ -1,2 +1,2 @@ """This file is auto-generated by setup.py, please do not alter.""" -__version__ = "2.10.1" +__version__ = "2.11.0" diff --git a/tests/issues/test_issue51.py b/tests/issues/test_issue51.py index eb6043bf3..50617ca81 100644 --- a/tests/issues/test_issue51.py +++ b/tests/issues/test_issue51.py @@ -29,7 +29,6 @@ def test_issue51(get_data_file): def test_issue51_similar(): - config["vars"]["num"]["low_categorical_threshold"] = 0 df = pd.DataFrame( { "test": ["", "hoi", None], @@ -41,6 +40,7 @@ def test_issue51_similar(): report = df.profile_report( title="Pandas Profiling Report", progress_bar=False, explorative=True ) + report.set_variable("vars.num.low_categorical_threshold", 0) # FIXME: assert correlation values # print(report.get_description()["correlations"]) @@ -50,7 +50,6 @@ def test_issue51_similar(): def test_issue51_empty(): - config["vars"]["num"]["low_categorical_threshold"] = 0 df = pd.DataFrame( { "test": ["", "", "", "", ""], @@ -64,6 +63,7 @@ def test_issue51_empty(): progress_bar=False, explorative=True, ) + report.set_variable("vars.num.low_categorical_threshold", 0) assert ( "cramers" not in report.get_description()["correlations"] @@ -75,7 +75,6 @@ def test_issue51_empty(): def test_issue51_identical(): - config["vars"]["num"]["low_categorical_threshold"] = 0 df = pd.DataFrame( { "test": ["v1", "v1", "v1"], @@ -87,6 +86,8 @@ def test_issue51_identical(): report = df.profile_report( title="Pandas Profiling Report", progress_bar=False, explorative=True ) + report.set_variable("vars.num.low_categorical_threshold", 0) + assert ( report.get_description()["correlations"]["cramers"].values == np.ones((3, 3)) ).all() diff --git a/tests/issues/test_issue664.py b/tests/issues/test_issue664.py index d5d5a38a4..71f2d63c3 100644 --- a/tests/issues/test_issue664.py +++ b/tests/issues/test_issue664.py @@ -1,15 +1,26 @@ -""" -Test for issue 664: -https://github.com/pandas-profiling/pandas-profiling/issues/664 -""" -import numpy as np -import pandas as pd - -from pandas_profiling import ProfileReport - - -def test_issue664(): - test = pd.DataFrame([np.nan] * 100, columns=["a"]) - - profile = ProfileReport(test) - assert len(profile.to_html()) > 0 +""" +Test for issue 664: +https://github.com/pandas-profiling/pandas-profiling/issues/664 +""" +import numpy as np +import pandas as pd + +from pandas_profiling import ProfileReport + + +def test_issue664(): + n = 10000 + df = pd.DataFrame({"a": [np.NaN] * n, "b": ["b"] * n, "c": [pd.NaT] * n}) + df = df.fillna(value=np.nan) + + profile = ProfileReport( + df, title="Pandas Profiling Report", explorative=True, minimal=True + ) + _ = profile.get_description() + + +def test_issue664_alt(): + test = pd.DataFrame([np.nan] * 100, columns=["a"]) + + 
profile = ProfileReport(test) + assert len(profile.to_html()) > 0 diff --git a/tests/issues/test_issue72.py b/tests/issues/test_issue72.py index 874e1af14..27a96c6c2 100644 --- a/tests/issues/test_issue72.py +++ b/tests/issues/test_issue72.py @@ -12,12 +12,10 @@ def test_issue72_higher(): # Showcase (and test) different ways of interfacing with config/profiling report - config["vars"]["num"]["low_categorical_threshold"].set(2) - df = pd.DataFrame({"A": [1, 2, 3, 3]}) df["B"] = df["A"].apply(str) report = pandas_profiling.ProfileReport(df, correlations=None) - + report.set_variable("vars.num.low_categorical_threshold", 2) # 3 > 2, so numerical assert report.get_description()["variables"]["A"]["type"] == Numeric # Strings are always categorical @@ -40,11 +38,10 @@ def test_issue72_equal(): def test_issue72_lower(): - config["vars"]["num"]["low_categorical_threshold"].set(10) - df = pd.DataFrame({"A": [1, 2, 3, 3, np.nan]}) df["B"] = df["A"].apply(str) report = df.profile_report(correlations=None) + report.set_variable("vars.num.low_categorical_threshold", 10) # 3 < 10, so categorical assert report.get_description()["variables"]["A"]["type"] == Categorical diff --git a/tests/unit/test_ge_integration.py b/tests/unit/test_ge_integration.py new file mode 100644 index 000000000..6d16e9582 --- /dev/null +++ b/tests/unit/test_ge_integration.py @@ -0,0 +1,108 @@ +import sys +from unittest.mock import Mock + +import pandas as pd +import pytest + +from pandas_profiling import ProfileReport + + +@pytest.fixture +def df(): + return pd.DataFrame({"num": [1, 2, 3, 4, 5]}) + + +@pytest.fixture(scope="function") +def mod(): + mod = Mock() + sys.modules["great_expectations"] = mod + return mod + + +@pytest.fixture(scope="function") +def context(): + return Mock() + + +def test_to_expectation_suite_raises(df): + report = ProfileReport(df) + with pytest.raises(ImportError): + report.to_expectation_suite() + + +def test_to_expectations_suite_context_save_and_build_data_docs(mod, context, df): + report = ProfileReport(df) + _ = report.to_expectation_suite( + data_context=context, + save_suite=True, + run_validation=False, + build_data_docs=True, + ) + + mod.data_context.DataContext.assert_not_called() + mod.dataset.PandasDataset.assert_called_once() + + context.create_expectation_suite.assert_called_once() + context.save_expectation_suite.assert_called_once() + context.build_data_docs.assert_called_once() + context.open_data_docs.assert_called_once() + + +def test_to_expectations_suite_context_no_save_and_build_data_docs(mod, context, df): + report = ProfileReport(df) + _ = report.to_expectation_suite( + data_context=context, + save_suite=False, + run_validation=False, + build_data_docs=True, + ) + + mod.data_context.DataContext.assert_not_called() + mod.dataset.PandasDataset.assert_called_once() + + context.create_expectation_suite.assert_called_once() + context.save_expectation_suite.assert_called_once() + context.build_data_docs.assert_called_once() + context.open_data_docs.assert_called_once() + + +def test_to_expectations_suite_context_no_save_and_no_build_data_docs(mod, context, df): + report = ProfileReport(df) + _ = report.to_expectation_suite( + data_context=context, + save_suite=False, + run_validation=False, + build_data_docs=False, + ) + + mod.data_context.DataContext.assert_not_called() + mod.dataset.PandasDataset.assert_called_once() + + context.create_expectation_suite.assert_called_once() + context.save_expectation_suite.assert_not_called() + context.build_data_docs.assert_not_called() + 
context.open_data_docs.assert_not_called() + + +def test_to_expectations_suite_title(context, df): + report = ProfileReport(df, title="Expectations Dataset") + _ = report.to_expectation_suite( + suite_name=None, + data_context=context, + run_validation=False, + ) + + context.create_expectation_suite.assert_called_once_with( + "expectations-dataset", overwrite_existing=True + ) + + +def test_to_expectation_suite_no_context(mod, df): + report = ProfileReport(df) + _ = report.to_expectation_suite( + data_context=None, + save_suite=False, + run_validation=False, + build_data_docs=False, + ) + mod.data_context.DataContext.assert_called_once() diff --git a/tests/unit/test_ge_integration_expectations.py b/tests/unit/test_ge_integration_expectations.py new file mode 100644 index 000000000..40f3850ca --- /dev/null +++ b/tests/unit/test_ge_integration_expectations.py @@ -0,0 +1,146 @@ +from unittest.mock import Mock, patch + +import pytest + +from pandas_profiling.model.expectation_algorithms import ( + categorical_expectations, + datetime_expectations, + file_expectations, + generic_expectations, + image_expectations, + numeric_expectations, + path_expectations, + url_expectations, +) + + +@pytest.fixture(scope="function") +def batch(): + return Mock() + + +def test_generic_expectations(batch): + generic_expectations("column", {"n_missing": 0, "p_unique": 1.0}, batch) + batch.expect_column_to_exist.assert_called_once() + batch.expect_column_values_to_not_be_null.assert_called_once() + batch.expect_column_values_to_be_unique.assert_called_once() + + +def test_generic_expectations_min(batch): + generic_expectations("column", {"n_missing": 1, "p_unique": 0.5}, batch) + batch.expect_column_to_exist.assert_called_once() + batch.expect_column_values_to_not_be_null.assert_not_called() + batch.expect_column_values_to_be_unique.assert_not_called() + + +orig_import = __import__ + + +def import_mock(name, *args): + if name == "great_expectations.profile.base": + mod = Mock() + mod.ProfilerTypeMapping.INT_TYPE_NAMES = [] + mod.ProfilerTypeMapping.FLOAT_TYPE_NAMES = [] + return mod + + return orig_import(name, *args) + + +@patch("builtins.__import__", side_effect=import_mock) +def test_numeric_expectations(batch): + numeric_expectations( + "column", + { + "monotonic_increase": True, + "monotonic_increase_strict": True, + "monotonic_decrease_strict": False, + "monotonic_decrease": True, + "min": -1, + "max": 5, + }, + batch, + ) + batch.expect_column_values_to_be_in_type_list.assert_called_once() + batch.expect_column_values_to_be_increasing.assert_called_once_with( + "column", strictly=True + ) + batch.expect_column_values_to_be_decreasing.assert_called_once_with( + "column", strictly=False + ) + batch.expect_column_values_to_be_between.assert_called_once_with( + "column", + min_value=-1, + max_value=5, + ) + + +@patch("builtins.__import__", side_effect=import_mock) +def test_numeric_expectations_min(batch): + numeric_expectations( + "column", + { + "monotonic_increase": False, + "monotonic_increase_strict": False, + "monotonic_decrease_strict": False, + "monotonic_decrease": False, + }, + batch, + ) + batch.expect_column_values_to_be_in_type_list.assert_called_once() + batch.expect_column_values_to_be_increasing.assert_not_called() + batch.expect_column_values_to_be_decreasing.assert_not_called() + batch.expect_column_values_to_be_between.assert_not_called() + + +def test_categorical_expectations(batch): + categorical_expectations( + "column", + { + "n_distinct": 1, + "p_distinct": 0.1, + 
"value_counts_without_nan": {"val1": 1, "val2": 2}, + }, + batch, + ) + batch.expect_column_values_to_be_in_set.assert_called_once_with( + "column", {"val1", "val2"} + ) + + +def test_categorical_expectations_min(batch): + categorical_expectations("column", {"n_distinct": 15, "p_distinct": 1.0}, batch) + batch.expect_column_values_to_be_in_set.assert_not_called() + + +def test_path_expectations(batch): + path_expectations("column", {}, batch) + batch.expect_column_to_exist.assert_not_called() + + +def test_datetime_expectations(batch): + datetime_expectations("column", {"min": 0, "max": 100}, batch) + batch.expect_column_values_to_be_between.assert_called_once_with( + "column", + min_value=0, + max_value=100, + ) + + +def test_datetime_expectations_min(batch): + datetime_expectations("column", {}, batch) + batch.expect_column_values_to_be_between.assert_not_called() + + +def test_image_expectations(batch): + image_expectations("column", {}, batch) + batch.expect_column_to_exist.assert_not_called() + + +def test_url_expectations(batch): + url_expectations("column", {}, batch) + batch.expect_column_to_exist.assert_not_called() + + +def test_file_expectations(batch): + file_expectations("column", {}, batch) + batch.expect_file_to_exist.assert_called_once() diff --git a/tests/unit/test_serialize.py b/tests/unit/test_serialize.py index d97e88aa1..aecc9e8f5 100644 --- a/tests/unit/test_serialize.py +++ b/tests/unit/test_serialize.py @@ -80,16 +80,6 @@ def test_load_error(): ProfileReport.clear_config() ProfileReport(df, minimal=False).loads(data, ignore_config=True) - # config not match - with pytest.raises(ValueError) as e: - ProfileReport.clear_config() - ProfileReport(df, minimal=False).loads(data) - - assert ( - str(e.value) - == 'DataFrame or Config do not match with the current ProfileReport. If you want to overwrite the current configuration, use "ignore_config=True"' - ) - # df not match with pytest.raises(ValueError) as e: ProfileReport.clear_config() @@ -97,7 +87,4 @@ def test_load_error(): data, ignore_config=True ) - assert ( - str(e.value) - == 'DataFrame or Config do not match with the current ProfileReport. If you want to overwrite the current configuration, use "ignore_config=True"' - ) + assert str(e.value) == "DataFrame does not match with the current ProfileReport."