Skip to content

Commit

Permalink
@W-17427085: Set ANNOY related dependencies to be optional (#3858)
Browse files Browse the repository at this point in the history
Changes:
- Removed `"annoy", "numpy", "pandas", "scikit-learn"` from `dependencies`
in `pyproject.toml` and added them as optional dependencies (the `select` extra)
- Created the flag `OPTIONAL_DEPENDENCIES_AVAILABLE` in `select_utils.py` to
indicate whether the ANNOY-related dependencies are present. If these optional
dependencies are not available, Levenshtein-distance-based selection is
still applied, even for high volumes of records (i.e.
`complexity_constant >= 1000`).
- Skipped the tests in `test_select_utils.py` that depend on `pandas` and the
ANNOY-related optional dependencies when those packages are not installed
- Added a warning when a threshold of `0` is used with ANNOY (for high volumes
of records), since exact matches can receive a small non-zero similarity score. Updated the docs as well
- Added an additional workflow job that runs all unit tests with all optional
dependencies installed
  • Loading branch information
aditya-balachander authored Dec 19, 2024
1 parent 534210c commit 89a5b5d
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 11 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/feature_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,30 @@ jobs:
- name: Run Pytest
run: uv run pytest --cov-report= --cov=cumulusci

unit_tests_opt_deps:
name: "Unit tests with optional dependencies: ${{ matrix.os }}-${{ matrix.python-version }}"
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [macos-latest, SFDO-Tooling-Ubuntu, SFDO-Tooling-Windows]
python-version: ["3.11", "3.12", "3.13"]
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "${{ matrix.python-version }}"
- name: Set up uv
uses: SFDO-Tooling/setup-uv@main
with:
version: "0.5.0"
enable-cache: true
- name: Install dependencies
run: uv sync --all-extras -p ${{ matrix.python-version }}
- name: Run Pytest
run: uv run pytest --cov-report= --cov=cumulusci

robot_api:
name: "Robot: No browser"
runs-on: SFDO-Tooling-Ubuntu
Expand Down
33 changes: 27 additions & 6 deletions cumulusci/tasks/bulkdata/select_utils.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,37 @@
import logging
import random
import re
import typing as T
from enum import Enum

import numpy as np
import pandas as pd
from annoy import AnnoyIndex
from pydantic import Field, root_validator, validator
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import StandardScaler

from cumulusci.core.enums import StrEnum
from cumulusci.tasks.bulkdata.extract_dataset_utils.hardcoded_default_declarations import (
DEFAULT_DECLARATIONS,
)
from cumulusci.tasks.bulkdata.utils import CaseInsensitiveDict
from cumulusci.utils import get_cci_upgrade_command
from cumulusci.utils.yaml.model_parser import CCIDictModel

logger = logging.getLogger(__name__)
# The ANNOY-based fast path needs several heavy third-party packages that are
# shipped as an optional extra. Import them together: if any one is missing,
# record that in OPTIONAL_DEPENDENCIES_AVAILABLE so callers can fall back to
# the Levenshtein-based selection instead of failing at import time.
try:
    import numpy as np
    import pandas as pd
    from annoy import AnnoyIndex
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.preprocessing import StandardScaler

    OPTIONAL_DEPENDENCIES_AVAILABLE = True
except ImportError:
    # Only the last fragment interpolates a value, so only it is an f-string.
    logger.warning(
        "Optional dependencies are missing. "
        "Handling high volumes of records for the 'select' functionality will be significantly slower, "
        "as optimizations for this feature are currently disabled. "
        f"To enable optimized performance, install all required dependencies using: {get_cci_upgrade_command()}[select]\n"
    )
    OPTIONAL_DEPENDENCIES_AVAILABLE = False


class SelectStrategy(StrEnum):
"""Enum defining the different selection strategies requested."""
Expand Down Expand Up @@ -308,7 +323,7 @@ def similarity_post_process(
select_records = []
insert_records = []

if complexity_constant < 1000:
if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE:
select_records, insert_records = levenshtein_post_process(
load_records, query_records, fields, weights, threshold
)
Expand All @@ -328,6 +343,12 @@ def annoy_post_process(
threshold: T.Union[float, None],
) -> T.Tuple[T.List[dict], list]:
"""Processes the query results for the similarity selection strategy using Annoy algorithm for large number of records"""
# Add warning when threshold is 0
if threshold is not None and threshold == 0:
logger.warning(
"Warning: A threshold of 0 may miss exact matches in high volumes. Use a small value like 0.1 for better accuracy."
)

selected_records = []
insertion_candidates = []

Expand Down
50 changes: 49 additions & 1 deletion cumulusci/tasks/bulkdata/tests/test_select_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import pytest

from cumulusci.tasks.bulkdata.select_utils import (
OPTIONAL_DEPENDENCIES_AVAILABLE,
SelectOperationExecutor,
SelectStrategy,
add_limit_offset_to_user_filter,
Expand All @@ -15,6 +15,14 @@
vectorize_records,
)

# pandas is an optional dependency; remember whether it could be imported so
# that tests which build DataFrames can be skipped when it is absent.
try:
    import pandas as pd
except ImportError:
    PANDAS_AVAILABLE = False
else:
    PANDAS_AVAILABLE = True


# Test Cases for standard_generate_query
def test_standard_generate_query_with_default_record_declaration():
Expand Down Expand Up @@ -511,6 +519,10 @@ def test_calculate_levenshtein_distance_weights_length_doesnt_match():
assert "Records must be same size as fields (weights)." in str(e.value)


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_all_numeric_columns():
df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", " 5.5", "6.5"]})
df_query = pd.DataFrame({"A": ["4", "5", ""], "B": ["4.5", "5.5", "6.5"]})
Expand All @@ -526,6 +538,10 @@ def test_all_numeric_columns():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_numeric_columns__one_non_numeric():
df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", "5.5", "6.5"]})
df_query = pd.DataFrame({"A": ["4", "5", "6"], "B": ["abcd", "5.5", "6.5"]})
Expand All @@ -541,6 +557,10 @@ def test_numeric_columns__one_non_numeric():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_all_boolean_columns():
df_db = pd.DataFrame(
{"A": ["true", "false", "true"], "B": ["false", "true", "false"]}
Expand All @@ -560,6 +580,10 @@ def test_all_boolean_columns():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_all_categorical_columns():
df_db = pd.DataFrame(
{"A": ["apple", "banana", "cherry"], "B": ["dog", "cat", "mouse"]}
Expand All @@ -579,6 +603,10 @@ def test_all_categorical_columns():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_mixed_types():
df_db = pd.DataFrame(
{
Expand Down Expand Up @@ -606,6 +634,10 @@ def test_mixed_types():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_vectorize_records_mixed_numerical_boolean_categorical():
# Test data with mixed types: numerical and categorical only
db_records = [["1.0", "true", "apple"], ["2.0", "false", "banana"]]
Expand Down Expand Up @@ -633,6 +665,10 @@ def test_vectorize_records_mixed_numerical_boolean_categorical():
), "Query vectors column count mismatch"


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_annoy_post_process():
# Test data
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
Expand All @@ -659,6 +695,10 @@ def test_annoy_post_process():
assert not insert_records


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_annoy_post_process__insert_records():
# Test data
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
Expand Down Expand Up @@ -714,6 +754,10 @@ def test_annoy_post_process__no_query_records():
] # The first insert record should match the second load record


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_annoy_post_process__insert_records_with_polymorphic_fields():
# Test data
load_records = [
Expand Down Expand Up @@ -749,6 +793,10 @@ def test_annoy_post_process__insert_records_with_polymorphic_fields():
] # The first insert record should match the second load record


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_single_record_match_annoy_post_process():
# Mock data where only the first query record matches the first load record
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
Expand Down
3 changes: 3 additions & 0 deletions docs/data.md
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,9 @@ This parameter is **optional**; if not specified, no threshold will be applied a

This feature is particularly useful during version upgrades, where records that closely match can be selected, while those that do not match sufficiently can be inserted into the target org.

**Important Note:**
For high volumes of records, an approximation algorithm is applied to improve performance. In such cases, setting a threshold of `0` may not guarantee the selection of exact matches, as the algorithm can assign a small non-zero similarity score to exact matches. To ensure accurate selection, it is recommended to set the threshold to a small value slightly greater than `0`, such as `0.1`. This ensures both precision and efficiency in the selection process.

---

#### Example
Expand Down
12 changes: 8 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ classifiers = [
"Programming Language :: Python :: 3.13",
]
dependencies = [
"annoy",
"click>=8.1",
"cryptography",
"python-dateutil",
Expand All @@ -35,8 +34,6 @@ dependencies = [
"defusedxml",
"lxml",
"MarkupSafe",
"numpy",
"pandas",
"psutil",
"pydantic<2",
"PyJWT",
Expand All @@ -53,7 +50,6 @@ dependencies = [
"rst2ansi>=0.1.5",
"salesforce-bulk",
"sarge",
"scikit-learn",
"selenium<4",
"simple-salesforce==1.11.4",
"snowfakery>=4.0.0",
Expand Down Expand Up @@ -88,6 +84,14 @@ lint = [
"pre-commit>=3.5.0",
]

[project.optional-dependencies]
select = [
"annoy",
"numpy",
"pandas",
"scikit-learn",
]

[project.scripts]
cci = "cumulusci.cli.cci:main"
snowfakery = "snowfakery.cli:main"
Expand Down

0 comments on commit 89a5b5d

Please sign in to comment.