Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
jstvz authored Dec 6, 2024
2 parents 105f575 + 5192132 commit 5b26be5
Show file tree
Hide file tree
Showing 21 changed files with 2,361 additions and 73 deletions.
47 changes: 21 additions & 26 deletions cumulusci/tasks/bulkdata/select_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,9 +352,6 @@ def annoy_post_process(
insertion_candidates = load_shaped_records
return selected_records, insertion_candidates

query_records = replace_empty_strings_with_missing(query_records)
select_shaped_records = replace_empty_strings_with_missing(select_shaped_records)

hash_features = 100
num_trees = 10

Expand Down Expand Up @@ -400,7 +397,7 @@ def annoy_post_process(
# Retrieve the corresponding record from the database
record = query_record_data[neighbor_index]
closest_record_id = record_to_id_map[tuple(record)]
if threshold and (neighbor_distances[idx] >= threshold):
if threshold is not None and (neighbor_distances[idx] >= threshold):
selected_records.append(None)
insertion_candidates.append(load_shaped_records[i])
else:
Expand Down Expand Up @@ -448,7 +445,7 @@ def levenshtein_post_process(
select_record, target_records, similarity_weights
)

if distance_threshold and match_distance > distance_threshold:
if distance_threshold is not None and match_distance > distance_threshold:
# Append load record for insertion if distance exceeds threshold
insertion_candidates.append(load_record)
selected_records.append(None)
Expand Down Expand Up @@ -589,7 +586,7 @@ def add_limit_offset_to_user_filter(
return f" {filter_clause}"


def determine_field_types(df, weights):
def determine_field_types(df_db, df_query, weights):
numerical_features = []
boolean_features = []
categorical_features = []
Expand All @@ -598,23 +595,35 @@ def determine_field_types(df, weights):
boolean_weights = []
categorical_weights = []

for col, weight in zip(df.columns, weights):
for col, weight in zip(df_db.columns, weights):
# Check if the column can be converted to numeric
try:
# Attempt to convert to numeric
df[col] = pd.to_numeric(df[col], errors="raise")
temp_df_db = pd.to_numeric(df_db[col], errors="raise")
temp_df_query = pd.to_numeric(df_query[col], errors="raise")
# Replace empty values with 0 for numerical features
df_db[col] = temp_df_db.fillna(0).replace("", 0)
df_query[col] = temp_df_query.fillna(0).replace("", 0)
numerical_features.append(col)
numerical_weights.append(weight)
except ValueError:
# Check for boolean values
if df[col].str.lower().isin(["true", "false"]).all():
if (
df_db[col].str.lower().isin(["true", "false"]).all()
and df_query[col].str.lower().isin(["true", "false"]).all()
):
# Map to actual boolean values
df[col] = df[col].str.lower().map({"true": True, "false": False})
df_db[col] = df_db[col].str.lower().map({"true": True, "false": False})
df_query[col] = (
df_query[col].str.lower().map({"true": True, "false": False})
)
boolean_features.append(col)
boolean_weights.append(weight)
else:
categorical_features.append(col)
categorical_weights.append(weight)
# Replace empty values with 'missing' for categorical features
df_db[col] = df_db[col].replace("", "missing")
df_query[col] = df_query[col].replace("", "missing")

return (
numerical_features,
Expand All @@ -640,14 +649,7 @@ def vectorize_records(db_records, query_records, hash_features, weights):
numerical_weights,
boolean_weights,
categorical_weights,
) = determine_field_types(df_db, weights)

# Modify query dataframe boolean columns to True or False
for col in df_query.columns:
if df_query[col].str.lower().isin(["true", "false"]).all():
df_query[col] = (
df_query[col].str.lower().map({"true": True, "false": False})
)
) = determine_field_types(df_db, df_query, weights)

# Fit StandardScaler on the numerical features of the database records
scaler = StandardScaler()
Expand Down Expand Up @@ -705,13 +707,6 @@ def vectorize_records(db_records, query_records, hash_features, weights):
return final_db_vectors, final_query_vectors


def replace_empty_strings_with_missing(
    records: T.Iterable[T.Iterable[T.Any]],
    placeholder: str = "missing",
) -> T.List[T.List[T.Any]]:
    """Return a copy of *records* with every empty-string field replaced.

    Each record is rebuilt as a new list; the input is not mutated. Only
    fields that compare equal to the empty string ``""`` are replaced --
    ``None``, ``0`` and whitespace-only strings are passed through unchanged.

    Args:
        records: An iterable of records, each record being an iterable of
            field values.
        placeholder: Value substituted for empty-string fields. Defaults to
            ``"missing"``, the value this function has always used.

    Returns:
        A list of lists mirroring the shape of *records*.
    """
    return [
        # Compare against "" explicitly (not truthiness) so that falsy but
        # meaningful values such as 0 or None are preserved.
        [placeholder if field == "" else field for field in record]
        for record in records
    ]


def split_and_filter_fields(fields: T.List[str]) -> T.Tuple[T.List[str], T.List[str]]:
# List to store non-lookup fields (load fields)
load_fields = []
Expand Down
4 changes: 4 additions & 0 deletions cumulusci/tasks/bulkdata/step.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,9 +478,11 @@ def select_records(self, records):
)

# Execute the main select query using Bulk API
self.logger.info("Retrieving records from org...")
select_query_records = self._execute_select_query(
select_query=select_query, query_fields=query_fields
)
self.logger.info(f"Retrieved {len(select_query_records)} from org")

query_records.extend(select_query_records)
# Post-process the query results
Expand Down Expand Up @@ -895,7 +897,9 @@ def select_records(self, records):
)

# Execute the query and gather the records
self.logger.info("Retrieving records from org...")
query_records = self._execute_soql_query(select_query, query_fields)
self.logger.info(f"Retrieved {len(query_records)} from org")

# Post-process the query results for this batch
(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
version: 1
interactions:
- &id001
include_file: GET_sobjects_Global_describe.yaml
- &id002
include_file: GET_sobjects_Account_describe.yaml
- *id001
- *id002
- *id002

- &id003
include_file: GET_sobjects_Contact_describe.yaml
- *id001
- *id003
- *id003
- &id007
include_file: GET_sobjects_Opportunity_describe.yaml
- *id002
- &id008
include_file: GET_sobjects_Lead_describe.yaml # Added interaction for Lead
- *id001
- &id009
include_file: GET_sobjects_Event_describe.yaml # Added interaction for Event
- *id001
- *id008
- *id001
- *id009
- *id001

- request:
method: GET
uri: https://orgname.my.salesforce.com/services/data/v62.0/limits/recordCount?sObjects=Account
body: null
headers: &id004
Request-Headers:
- Elided
response:
status:
code: 200
message: OK
headers: &id006
Content-Type:
- application/json;charset=UTF-8
Others: Elided
body:
string: "{\n \"sObjects\" : [ {\n \"count\" : 3,\n \"name\" : \"Account\"\n
\ } ]\n}"

- request:
method: GET
uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id,%20Name,%20Description,%20Phone,%20AccountNumber%20FROM%20Account%20WHERE%20Name%20!=%20'Sample%20Account%20for%20Entitlements'
body: null
headers: *id004
response:
status:
code: 200
message: OK
headers: *id006
body:
string: "{\n \"totalSize\" : 10,\n \"done\" : true,\n \"records\" : [ {\n
\ \"attributes\" : {\n \"type\" : \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1RMDQA3\"\n
\ },\n \"Id\" : \"0019H00000H1RMDQA3\",\n \"Name\" : \"Tom Cruise\",\n
\ \"Description\" : \"Some Description\",\n \"Phone\" : \"12345632\",\n
\ \"AccountNumber\" : \"123\"\n }, {\n \"attributes\" : {\n \"type\"
: \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1RMEQA3\"\n
\ },\n \"Id\" : \"0019H00000H1RMEQA3\",\n \"Name\" : \"Bob The Builder\",\n
\ \"Description\" : \"Some Description\",\n \"Phone\" : \"12345632\",\n
\ \"AccountNumber\" : \"123\"\n }, {\n \"attributes\" : {\n \"type\"
: \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1RMFQA3\"\n
\ },\n \"Id\" : \"0019H00000H1RMFQA3\",\n \"Name\" : \"Shah Rukh Khan\",\n
\ \"Description\" : \"Bollywood actor\",\n \"Phone\" : \"12345612\",\n
\ \"AccountNumber\" : \"123\"\n }, {\n \"attributes\" : {\n \"type\"
: \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1RMGQA3\"\n
\ },\n \"Id\" : \"0019H00000H1RMGQA3\",\n \"Name\" : \"Aamir Khan\",\n
\ \"Description\" : \"Mr perfectionist, bollywood actor\",\n \"Phone\"
: \"12345623\",\n \"AccountNumber\" : \"123\"\n }, {\n \"attributes\"
: {\n \"type\" : \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1RMHQA3\"\n
\ },\n \"Id\" : \"0019H00000H1RMHQA3\",\n \"Name\" : \"Salman Khan\",\n
\ \"Description\" : \"Mr perfectionist, bollywood actor\",\n \"Phone\"
: \"12345623\",\n \"AccountNumber\" : \"123\"\n }, {\n \"attributes\"
: {\n \"type\" : \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1UzyQAF\"\n
\ },\n \"Id\" : \"0019H00000H1UzyQAF\",\n \"Name\" : \"Tom Cruise\",\n
\ \"Description\" : \"Some Description\",\n \"Phone\" : \"12345632\",\n
\ \"AccountNumber\" : \"123\"\n }, {\n \"attributes\" : {\n \"type\"
: \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1UzzQAF\"\n
\ },\n \"Id\" : \"0019H00000H1UzzQAF\",\n \"Name\" : \"Bob The Builder\",\n
\ \"Description\" : \"Some Description\",\n \"Phone\" : \"12345632\",\n
\ \"AccountNumber\" : \"123\"\n }, {\n \"attributes\" : {\n \"type\"
: \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1V00QAF\"\n
\ },\n \"Id\" : \"0019H00000H1V00QAF\",\n \"Name\" : \"Shah Rukh Khan\",\n
\ \"Description\" : \"Bollywood actor\",\n \"Phone\" : \"12345612\",\n
\ \"AccountNumber\" : \"123\"\n }, {\n \"attributes\" : {\n \"type\"
: \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1V01QAF\"\n
\ },\n \"Id\" : \"0019H00000H1V01QAF\",\n \"Name\" : \"Aamir Khan\",\n
\ \"Description\" : \"Mr perfectionist, bollywood actor\",\n \"Phone\"
: \"12345623\",\n \"AccountNumber\" : \"123\"\n }, {\n \"attributes\"
: {\n \"type\" : \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1V02QAF\"\n
\ },\n \"Id\" : \"0019H00000H1V02QAF\",\n \"Name\" : \"Salman Khan\",\n
\ \"Description\" : \"Mr perfectionist, bollywood actor\",\n \"Phone\"
: \"12345623\",\n \"AccountNumber\" : \"123\"\n } ]\n}"




- request:
method: POST
uri: https://orgname.my.salesforce.com/services/data/v62.0/composite/sobjects
body: '{"allOrNone": false, "records": [{"LastName": "Contact of Tom Cruise",
"AccountId": "0019H00000H1RMDQA3", "attributes": {"type": "Contact"}}, {"LastName":
"Contact of Bob the Builder", "AccountId": "0019H00000H1RMDQA3", "attributes":
{"type": "Contact"}}, {"LastName": "Contact of SRK", "AccountId": "0019H00000H1RMDQA3",
"attributes": {"type": "Contact"}}]}'
headers: *id004
response:
status:
code: 200
message: OK
headers: *id006
body:
string: "[ {\n \"id\" : \"0039H00000BbbFBQAZ\",\n \"success\" : true,\n \"errors\"
: [ ]\n}, {\n \"id\" : \"0039H00000BbbFCQAZ\",\n \"success\" : true,\n \"errors\"
: [ ]\n}, {\n \"id\" : \"0039H00000BbbFDQAZ\",\n \"success\" : true,\n \"errors\"
: [ ]\n} ]"


- request:
method: GET
uri: https://orgname.my.salesforce.com/services/data/v62.0/query/?q=SELECT%20Id%20FROM%20Account%20WHERE%20Name%20!=%20'Sample%20Account%20for%20Entitlements'%20LIMIT%205
body: null
headers: *id004
response:
status:
code: 200
message: OK
headers: *id006
body:
string: "{\n \"totalSize\" : 5,\n \"done\" : true,\n \"records\" : [ {\n
\ \"attributes\" : {\n \"type\" : \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1RMDQA3\"\n
\ },\n \"Id\" : \"0019H00000H1RMDQA3\"\n }, {\n \"attributes\" :
{\n \"type\" : \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1RMEQA3\"\n
\ },\n \"Id\" : \"0019H00000H1RMDQA3\"\n }, {\n \"attributes\" :
{\n \"type\" : \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1RMFQA3\"\n
\ },\n \"Id\" : \"0019H00000H1RMDQA3\"\n }, {\n \"attributes\" :
{\n \"type\" : \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1RMGQA3\"\n
\ },\n \"Id\" : \"0019H00000H1RMDQA3\"\n }, {\n \"attributes\" :
{\n \"type\" : \"Account\",\n \"url\" : \"/services/data/v62.0/sobjects/Account/0019H00000H1RMHQA3\"\n
\ },\n \"Id\" : \"0019H00000H1RMDQA3\"\n } ]\n}"
Loading

0 comments on commit 5b26be5

Please sign in to comment.