From 89a5b5ddb9b25bf22a199ab8ab93eb79c57f2afe Mon Sep 17 00:00:00 2001 From: aditya-balachander Date: Thu, 19 Dec 2024 09:57:09 +0530 Subject: [PATCH] @W-17427085: Set ANNOY related dependencies to be optional (#3858) Changes: - Remove `"annoy", "numpy", "pandas", "scikit-learn"` from dependencies under `pyproject.toml` and add them under optional dependencies - Created flag `OPTIONAL_DEPENDENCIES_AVAILABLE`, to indicate if ANNOY related dependencies are present in `select_utils.py`. If these optional dependencies are not available, for high volume of records (i.e. `complexity_constant >= 1000`), still Levenshtein Distance based selection will apply. - Skipped those pytests which have dependencies on `pandas` and ANNOY related optional dependencies under `test_select_utils.py` - Adding a warning message for non-zero similarity score when using ANNOY (for high volume of records). Updated the docs as well - Added additional workflow to run all unit tests with all optional dependencies installed --- .github/workflows/feature_test.yml | 24 +++++++++ cumulusci/tasks/bulkdata/select_utils.py | 33 +++++++++--- .../tasks/bulkdata/tests/test_select_utils.py | 50 ++++++++++++++++++- docs/data.md | 3 ++ pyproject.toml | 12 +++-- 5 files changed, 111 insertions(+), 11 deletions(-) diff --git a/.github/workflows/feature_test.yml b/.github/workflows/feature_test.yml index 9433041f85..001f4b9faf 100644 --- a/.github/workflows/feature_test.yml +++ b/.github/workflows/feature_test.yml @@ -63,6 +63,30 @@ jobs: - name: Run Pytest run: uv run pytest --cov-report= --cov=cumulusci + unit_tests_opt_deps: + name: "Unit tests with optional dependencies: ${{ matrix.os }}-${{ matrix.python-version }}" + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest, SFDO-Tooling-Ubuntu, SFDO-Tooling-Windows] + python-version: ["3.11", "3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "${{ matrix.python-version }}" + - name: Set up uv + uses: SFDO-Tooling/setup-uv@main + with: + version: "0.5.0" + enable-cache: true + - name: Install dependencies + run: uv sync --all-extras -p ${{ matrix.python-version }} + - name: Run Pytest + run: uv run pytest --cov-report= --cov=cumulusci + robot_api: name: "Robot: No browser" runs-on: SFDO-Tooling-Ubuntu diff --git a/cumulusci/tasks/bulkdata/select_utils.py b/cumulusci/tasks/bulkdata/select_utils.py index b15389402b..b37aa457ad 100644 --- a/cumulusci/tasks/bulkdata/select_utils.py +++ b/cumulusci/tasks/bulkdata/select_utils.py @@ -1,22 +1,37 @@ +import logging import random import re import typing as T from enum import Enum -import numpy as np -import pandas as pd -from annoy import AnnoyIndex from pydantic import Field, root_validator, validator -from sklearn.feature_extraction.text import HashingVectorizer -from sklearn.preprocessing import StandardScaler from cumulusci.core.enums import StrEnum from cumulusci.tasks.bulkdata.extract_dataset_utils.hardcoded_default_declarations import ( DEFAULT_DECLARATIONS, ) from cumulusci.tasks.bulkdata.utils import CaseInsensitiveDict +from cumulusci.utils import get_cci_upgrade_command from cumulusci.utils.yaml.model_parser import CCIDictModel +logger = logging.getLogger(__name__) +try: + import numpy as np + import pandas as pd + from annoy import AnnoyIndex + from sklearn.feature_extraction.text import HashingVectorizer + from sklearn.preprocessing import StandardScaler + + OPTIONAL_DEPENDENCIES_AVAILABLE = True +except 
ImportError: + logger.warning( + f"Optional dependencies are missing. " + "Handling high volumes of records for the 'select' functionality will be significantly slower, " + "as optimizations for this feature are currently disabled. " + f"To enable optimized performance, install all required dependencies using: {get_cci_upgrade_command()}[select]\n" + ) + OPTIONAL_DEPENDENCIES_AVAILABLE = False + class SelectStrategy(StrEnum): """Enum defining the different selection strategies requested.""" @@ -308,7 +323,7 @@ def similarity_post_process( select_records = [] insert_records = [] - if complexity_constant < 1000: + if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE: select_records, insert_records = levenshtein_post_process( load_records, query_records, fields, weights, threshold ) @@ -328,6 +343,12 @@ def annoy_post_process( threshold: T.Union[float, None], ) -> T.Tuple[T.List[dict], list]: """Processes the query results for the similarity selection strategy using Annoy algorithm for large number of records""" + # Add warning when threshold is 0 + if threshold is not None and threshold == 0: + logger.warning( + "Warning: A threshold of 0 may miss exact matches in high volumes. Use a small value like 0.1 for better accuracy." + ) + selected_records = [] insertion_candidates = [] diff --git a/cumulusci/tasks/bulkdata/tests/test_select_utils.py b/cumulusci/tasks/bulkdata/tests/test_select_utils.py index 447cdccef6..589f66806a 100644 --- a/cumulusci/tasks/bulkdata/tests/test_select_utils.py +++ b/cumulusci/tasks/bulkdata/tests/test_select_utils.py @@ -1,7 +1,7 @@ -import pandas as pd import pytest from cumulusci.tasks.bulkdata.select_utils import ( + OPTIONAL_DEPENDENCIES_AVAILABLE, SelectOperationExecutor, SelectStrategy, add_limit_offset_to_user_filter, @@ -15,6 +15,14 @@ vectorize_records, ) +# Check for pandas availability +try: + import pandas as pd + + PANDAS_AVAILABLE = True +except ImportError: + PANDAS_AVAILABLE = False + # Test Cases for standard_generate_query def test_standard_generate_query_with_default_record_declaration(): @@ -511,6 +519,10 @@ def test_calculate_levenshtein_distance_weights_length_doesnt_match(): assert "Records must be same size as fields (weights)." 
in str(e.value) +@pytest.mark.skipif( + not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE, + reason="requires optional dependencies for annoy", +) def test_all_numeric_columns(): df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", " 5.5", "6.5"]}) df_query = pd.DataFrame({"A": ["4", "5", ""], "B": ["4.5", "5.5", "6.5"]}) @@ -526,6 +538,10 @@ def test_all_numeric_columns(): assert determine_field_types(df_db, df_query, weights) == expected_output +@pytest.mark.skipif( + not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE, + reason="requires optional dependencies for annoy", +) def test_numeric_columns__one_non_numeric(): df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", "5.5", "6.5"]}) df_query = pd.DataFrame({"A": ["4", "5", "6"], "B": ["abcd", "5.5", "6.5"]}) @@ -541,6 +557,10 @@ def test_numeric_columns__one_non_numeric(): assert determine_field_types(df_db, df_query, weights) == expected_output +@pytest.mark.skipif( + not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE, + reason="requires optional dependencies for annoy", +) def test_all_boolean_columns(): df_db = pd.DataFrame( {"A": ["true", "false", "true"], "B": ["false", "true", "false"]} @@ -560,6 +580,10 @@ def test_all_boolean_columns(): assert determine_field_types(df_db, df_query, weights) == expected_output +@pytest.mark.skipif( + not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE, + reason="requires optional dependencies for annoy", +) def test_all_categorical_columns(): df_db = pd.DataFrame( {"A": ["apple", "banana", "cherry"], "B": ["dog", "cat", "mouse"]} @@ -579,6 +603,10 @@ def test_all_categorical_columns(): assert determine_field_types(df_db, df_query, weights) == expected_output +@pytest.mark.skipif( + not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE, + reason="requires optional dependencies for annoy", +) def test_mixed_types(): df_db = pd.DataFrame( { @@ -606,6 +634,10 @@ def test_mixed_types(): assert determine_field_types(df_db, df_query, weights) == expected_output +@pytest.mark.skipif( + not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE, + reason="requires optional dependencies for annoy", +) def test_vectorize_records_mixed_numerical_boolean_categorical(): # Test data with mixed types: numerical and categorical only db_records = [["1.0", "true", "apple"], ["2.0", "false", "banana"]] @@ -633,6 +665,10 @@ def test_vectorize_records_mixed_numerical_boolean_categorical(): ), "Query vectors column count mismatch" +@pytest.mark.skipif( + not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE, + reason="requires optional dependencies for annoy", +) def test_annoy_post_process(): # Test data load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]] @@ -659,6 +695,10 @@ def test_annoy_post_process(): assert not insert_records +@pytest.mark.skipif( + not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE, + reason="requires optional dependencies for annoy", +) def test_annoy_post_process__insert_records(): # Test data load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]] @@ -714,6 +754,10 @@ def test_annoy_post_process__no_query_records(): ] # The first insert record should match the second load record +@pytest.mark.skipif( + not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE, + reason="requires optional dependencies for annoy", +) def test_annoy_post_process__insert_records_with_polymorphic_fields(): # Test data load_records = [ @@ -749,6 +793,10 @@ def test_annoy_post_process__insert_records_with_polymorphic_fields(): ] # The 
first insert record should match the second load record +@pytest.mark.skipif( + not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE, + reason="requires optional dependencies for annoy", +) def test_single_record_match_annoy_post_process(): # Mock data where only the first query record matches the first load record load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]] diff --git a/docs/data.md b/docs/data.md index fe9396a4ae..ba61076315 100644 --- a/docs/data.md +++ b/docs/data.md @@ -352,6 +352,9 @@ This parameter is **optional**; if not specified, no threshold will be applied a This feature is particularly useful during version upgrades, where records that closely match can be selected, while those that do not match sufficiently can be inserted into the target org. +**Important Note:** +For high volumes of records, an approximation algorithm is applied to improve performance. In such cases, setting a threshold of `0` may not guarantee the selection of exact matches, as the algorithm can assign a small non-zero similarity score to exact matches. To ensure accurate selection, it is recommended to set the threshold to a small value slightly greater than `0`, such as `0.1`. This ensures both precision and efficiency in the selection process. + --- #### Example diff --git a/pyproject.toml b/pyproject.toml index 7dec9eedab..d840b1eb9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,6 @@ classifiers = [ "Programming Language :: Python :: 3.13", ] dependencies = [ - "annoy", "click>=8.1", "cryptography", "python-dateutil", @@ -35,8 +34,6 @@ dependencies = [ "defusedxml", "lxml", "MarkupSafe", - "numpy", - "pandas", "psutil", "pydantic<2", "PyJWT", @@ -53,7 +50,6 @@ dependencies = [ "rst2ansi>=0.1.5", "salesforce-bulk", "sarge", - "scikit-learn", "selenium<4", "simple-salesforce==1.11.4", "snowfakery>=4.0.0", @@ -88,6 +84,14 @@ lint = [ "pre-commit>=3.5.0", ] +[project.optional-dependencies] +select = [ + "annoy", + "numpy", + "pandas", + "scikit-learn", +] + [project.scripts] cci = "cumulusci.cli.cci:main" snowfakery = "snowfakery.cli:main"
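
The fallback introduced in `similarity_post_process` hinges on two conditions: record volume and dependency availability. The sketch below distills that dispatch into a standalone snippet. The helper name `choose_similarity_strategy` and the assumption that `complexity_constant` is the product of the two record counts are illustrative only; the real function in `select_utils.py` takes full record payloads, fields, weights, and a threshold, and returns selected/insert records rather than a strategy name. The import guard here probes `annoy` alone as shorthand for the full optional set (annoy, numpy, pandas, scikit-learn) guarded in the patch.

```python
import logging

logger = logging.getLogger(__name__)

try:
    # Stand-in for the full optional "select" extra guarded in select_utils.py.
    from annoy import AnnoyIndex  # noqa: F401

    OPTIONAL_DEPENDENCIES_AVAILABLE = True
except ImportError:
    OPTIONAL_DEPENDENCIES_AVAILABLE = False
    logger.warning("Optional 'select' dependencies are missing; using the slower path.")


def choose_similarity_strategy(num_load_records: int, num_query_records: int) -> str:
    """Pick the post-processing strategy the way the patch does: fall back to
    Levenshtein either for small volumes or when the optional dependencies
    are not installed."""
    # Assumption for illustration: complexity grows with the product of record counts.
    complexity_constant = num_load_records * num_query_records
    if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE:
        return "levenshtein"
    return "annoy"


if __name__ == "__main__":
    # 50 x 10 = 500 -> below the 1000 cutoff, so Levenshtein either way.
    print(choose_similarity_strategy(50, 10))
    # 100 x 100 = 10000 -> Annoy only if the optional extras are importable.
    print(choose_similarity_strategy(100, 100))
```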
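The test-suite changes follow a standard import-or-skip pattern. Below is a condensed, self-contained version; the real `test_select_utils.py` imports `OPTIONAL_DEPENDENCIES_AVAILABLE` from `select_utils` instead of re-deriving it, and its test bodies exercise `determine_field_types`, `vectorize_records`, and `annoy_post_process` rather than the placeholder assertion shown here.

```python
import pytest

try:
    import pandas as pd

    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False

try:
    from annoy import AnnoyIndex  # noqa: F401

    OPTIONAL_DEPENDENCIES_AVAILABLE = True
except ImportError:
    OPTIONAL_DEPENDENCIES_AVAILABLE = False


@pytest.mark.skipif(
    not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
    reason="requires optional dependencies for annoy",
)
def test_needs_optional_dependencies():
    # Placeholder assertion; the real tests build DataFrames and Annoy indexes.
    df = pd.DataFrame({"A": ["1", "2"], "B": ["3", "4"]})
    assert list(df.columns) == ["A", "B"]
```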
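The new warning in `annoy_post_process` and the note added to `docs/data.md` both address the `threshold == 0` edge case. The toy example below shows where that advice comes from in practice: it indexes a couple of hand-written vectors with Annoy and applies a cutoff to the returned angular distances. The plain float lists are a stand-in for the HashingVectorizer/StandardScaler vectorization used in `select_utils.py`, and the distance-versus-similarity bookkeeping is deliberately simplified.

```python
from annoy import AnnoyIndex

# Toy 3-dimensional "record" vectors standing in for vectorized target records.
target_vectors = [
    [1.0, 0.0, 0.5],   # record 0
    [0.0, 1.0, 0.25],  # record 1
]

index = AnnoyIndex(3, "angular")
for i, vector in enumerate(target_vectors):
    index.add_item(i, vector)
index.build(10)  # 10 trees; more trees -> better accuracy, slower build

query_vector = [1.0, 0.0, 0.5]  # intended to be an exact match for record 0
neighbor_ids, distances = index.get_nns_by_vector(
    query_vector, 1, include_distances=True
)

# As the updated docs note, the approximate path may report a small non-zero
# score even for records that match exactly, so a strict cutoff of 0 can
# reject them; a small positive threshold such as 0.1 keeps them selectable.
threshold = 0.1
is_close_enough = distances[0] <= threshold
print(neighbor_ids[0], distances[0], is_close_enough)
```

With a cutoff of exactly 0 the comparison only accepts a reported distance of 0.0, which the approximate index is not guaranteed to produce; the small positive threshold mirrors the recommendation added to `docs/data.md`.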
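Because the heavy libraries now live behind the optional `select` extra in `pyproject.toml`, it can be handy to confirm up front whether the optimized path will be active. The helper below is a generic, hypothetical check (not part of CumulusCI's CLI) that probes for the extra's packages without importing them.

```python
from importlib.util import find_spec

# Import names of the packages moved into the optional "select" extra.
SELECT_EXTRA_PACKAGES = ("annoy", "numpy", "pandas", "sklearn")


def select_extra_installed() -> bool:
    """Return True if every package from the 'select' extra is importable."""
    return all(find_spec(name) is not None for name in SELECT_EXTRA_PACKAGES)


if __name__ == "__main__":
    if select_extra_installed():
        print("Optimized similarity selection (Annoy) is available.")
    else:
        print("Optional dependencies missing; install the 'select' extra to enable it.")
```

The warning added to `select_utils.py` builds the equivalent install hint from `get_cci_upgrade_command()` followed by `[select]`; for a plain pip installation that corresponds to something like `pip install --upgrade "cumulusci[select]"`.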