Skip to content

Commit

Permalink
remove shared encodings in columns
Browse files Browse the repository at this point in the history
  • Loading branch information
diegomarvid committed Jul 17, 2024
1 parent d37c717 commit b5f97b8
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 177 deletions.
14 changes: 8 additions & 6 deletions examples/delay/configs/base.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,10 @@
"step_type": "EncodeStep",
"parameters": {
"feature_encoders": {
"shared_origin_destination": {
"features": [
"Origin",
"Destination"
],
"Origin": {
"encoder": "OrdinalEncoder"
},
"Destination": {
"encoder": "OrdinalEncoder"
}
}
Expand All @@ -67,7 +66,10 @@
"step_type": "CalculateMetricsStep"
},
{
"step_type": "ExplainerDashboardStep"
"step_type": "ExplainerDashboardStep",
"parameters": {
"enable_step": false
}
}
]
}
Expand Down
190 changes: 24 additions & 166 deletions ml_garden/core/steps/encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,7 @@ def __init__(
self.feature_encoders = feature_encoders or {}

def execute(self, data: DataContainer) -> DataContainer:
"""Execute the encoding step.
Parameters
----------
data : DataContainer
The data container
Returns
-------
DataContainer
The updated data container
"""
"""Execute the encoding step."""
self.logger.info("Encoding data")

target_column_name = data.target
Expand Down Expand Up @@ -125,6 +116,7 @@ def _apply_encoding(
log: Optional[bool] = False,
) -> Tuple[pd.DataFrame, Optional[pd.Series], Optional[ColumnTransformer]]:
"""Apply the encoding to the data.
Parameters
----------
df : pd.DataFrame
Expand All @@ -143,6 +135,7 @@ def _apply_encoding(
The saved encoder, by default None
log : Optional[bool], optional
Whether to log information about the features, by default False
Returns
-------
Tuple[pd.DataFrame, Optional[pd.Series], Optional[ColumnTransformer]]
Expand Down Expand Up @@ -199,18 +192,7 @@ def _apply_encoding(
def _get_feature_types(
self, df: pd.DataFrame, target_column_name: str
) -> Tuple[List[str], List[str]]:
"""Get categorical and numeric feature lists.
Parameters
----------
df : pd.DataFrame
The DataFrame
target_column_name : str
The target column name
Returns
-------
Tuple[List[str], List[str]]
Categorical and numeric features
"""
"""Get categorical and numeric feature lists."""
categorical_features = [
col
for col in df.columns
Expand All @@ -227,18 +209,7 @@ def _get_feature_types(
def _split_categorical_features(
self, df: pd.DataFrame, categorical_features: List[str]
) -> Tuple[List[str], List[str]]:
"""Split categorical features into low and high cardinality features.
Parameters
----------
df : pd.DataFrame
The DataFrame
categorical_features : List[str]
Categorical features
Returns
-------
Tuple[List[str], List[str]]
Low and high cardinality features
"""
"""Split categorical features into low and high cardinality features."""
low_cardinality_features = [
col for col in categorical_features if df[col].nunique() <= self.cardinality_threshold
]
Expand All @@ -250,35 +221,24 @@ def _split_categorical_features(
def _get_encoder_class_and_params(
    self, encoder_name: str
) -> Tuple[Union[Type[OrdinalEncoder], Type[TargetEncoder]], dict[str, Any]]:
    """Map an encoder name to its encoder class and default parameters.

    Parameters
    ----------
    encoder_name : str
        Key into ``ENCODER_MAP`` / ``ENCODER_MAP_PARAMS`` (e.g. ``"OrdinalEncoder"``).

    Returns
    -------
    Tuple[Union[Type[OrdinalEncoder], Type[TargetEncoder]], dict[str, Any]]
        The encoder class and its default constructor parameters.

    Raises
    ------
    ValueError
        If ``encoder_name`` is missing from either lookup table.
    """
    encoder = self.ENCODER_MAP.get(encoder_name)
    encoder_params = self.ENCODER_MAP_PARAMS.get(encoder_name)

    # Both tables must know the encoder; a single combined check produces
    # one consistent error message for either kind of misconfiguration.
    if not encoder or encoder_params is None:
        raise ValueError(
            f"Unsupported encoder: {encoder_name}. Supported encoders:"
            f" {list(self.ENCODER_MAP.keys())}"
        )

    return encoder, encoder_params

def _log_encoder_override(
self, feature, encoder_class, high_cardinality_features, low_cardinality_features
self,
feature: str,
encoder_class: Type[Union[OrdinalEncoder, TargetEncoder]],
high_cardinality_features: List[str],
low_cardinality_features: List[str],
):
if feature in high_cardinality_features:
self.logger.info(
Expand All @@ -301,38 +261,19 @@ def _create_column_transformer(
low_cardinality_features: List[str],
numeric_features: List[str],
) -> ColumnTransformer:
"""Create a ColumnTransformer for encoding."""
transformers = []
encoded_features = set()

# Handle shared encoder for Origin and Destination
if "shared_origin_destination" in self.feature_encoders:
shared_config = self.feature_encoders["shared_origin_destination"]
shared_features = shared_config["features"]
encoder_class, encoder_params = self._get_encoder_class_and_params(
shared_config["encoder"]
)
encoder_params.update(shared_config.get("params", {}))
shared_encoder = encoder_class(**encoder_params)

transformers.append(("shared_origin_destination", shared_encoder, shared_features))
encoded_features.update(shared_features)

for feature in shared_features:
self._log_encoder_override(
feature, encoder_class, high_cardinality_features, low_cardinality_features
)

# Handle other categorical features
remaining_categorical = (
set(high_cardinality_features + low_cardinality_features) - encoded_features
)
for feature in remaining_categorical:
for feature in high_cardinality_features + low_cardinality_features:
if feature in self.feature_encoders:
encoder_config = self.feature_encoders[feature]
encoder_class, encoder_params = self._get_encoder_class_and_params(
encoder_config["encoder"]
)
encoder_params.update(encoder_config.get("params", {}))
self._log_encoder_override(
feature, encoder_class, high_cardinality_features, low_cardinality_features
)
elif feature in high_cardinality_features:
encoder_class, encoder_params = self._get_encoder_class_and_params(
self.HIGH_CARDINALITY_ENCODER
Expand All @@ -344,9 +285,7 @@ def _create_column_transformer(

encoder = encoder_class(**encoder_params)
transformers.append((f"{feature}_encoder", encoder, [feature]))
encoded_features.add(feature)

# Handle numeric features
if numeric_features:
transformers.append(("numeric", "passthrough", numeric_features))

Expand All @@ -363,22 +302,7 @@ def _transform_data(
column_transformer: ColumnTransformer,
is_train: bool = False,
) -> tuple[pd.DataFrame, Optional[pd.Series]]:
"""Transform the data using the ColumnTransformer.
Parameters
----------
df : pd.DataFrame
The DataFrame to transform
target_column_name : str
The target column name
column_transformer : ColumnTransformer
The ColumnTransformer
is_train : bool, optional
Whether the data is for training, by default False
Returns
-------
tuple[pd.DataFrame, Optional[pd.Series]]
The transformed data and the target column
"""
"""Transform the data using the ColumnTransformer."""
if target_column_name in df.columns:
X = df.drop(columns=[target_column_name]) # Drop the target column
y = df[target_column_name] # Target column for training data
Expand All @@ -400,36 +324,14 @@ def _transform_data(
)

def _restore_column_order(self, df: pd.DataFrame, encoded_data: pd.DataFrame) -> pd.DataFrame:
"""Restore the original column order.
Parameters
----------
df : pd.DataFrame
The original DataFrame
encoded_data : pd.DataFrame
The encoded DataFrame
Returns
-------
pd.DataFrame
The encoded DataFrame with the original column order
"""
"""Restore the original column order."""
new_column_order = [col for col in df.columns if col in encoded_data.columns]
return encoded_data[new_column_order]

def _convert_ordinal_encoded_columns_to_int(
self, encoded_data: pd.DataFrame, encoded_feature_map: Dict[str, str]
) -> pd.DataFrame:
"""Convert ordinal encoded columns to the smallest possible integer dtype.
Parameters
----------
encoded_data : pd.DataFrame
The encoded DataFrame
encoded_feature_map : Dict[str, str]
The feature encoder map
Returns
-------
pd.DataFrame
The encoded DataFrame with ordinal encoded columns converted to int
"""
"""Convert ordinal encoded columns to the smallest possible integer dtype."""
ordinal_encoded_features = [
col for col, encoder in encoded_feature_map.items() if encoder == "OrdinalEncoder"
]
Expand All @@ -455,18 +357,7 @@ def _convert_ordinal_encoded_columns_to_int(
def _restore_numeric_dtypes(
self, encoded_data: pd.DataFrame, original_numeric_dtypes: dict
) -> pd.DataFrame:
"""Restore original dtypes of numeric features.
Parameters
----------
encoded_data : pd.DataFrame
The encoded DataFrame
original_numeric_dtypes : dict
The original numeric dtypes
Returns
-------
pd.DataFrame
The encoded DataFrame with the original numeric dtypes
"""
"""Restore original dtypes of numeric features."""
for col, dtype in original_numeric_dtypes.items():
if col in encoded_data.columns:
try:
Expand All @@ -478,38 +369,18 @@ def _restore_numeric_dtypes(
return encoded_data

def _convert_float64_to_float32(self, encoded_data: pd.DataFrame) -> pd.DataFrame:
"""Convert float64 columns to float32.
Parameters
----------
encoded_data : pd.DataFrame
The encoded DataFrame
Returns
-------
pd.DataFrame
The encoded DataFrame with float64 columns converted to float32
"""
"""Convert float64 columns to float32."""
float64_columns = encoded_data.select_dtypes(include=["float64"]).columns
for col in float64_columns:
encoded_data[col] = encoded_data[col].astype(np.float32)
return encoded_data

def _create_feature_encoder_map(self, column_transformer: ColumnTransformer) -> Dict[str, str]:
"""
Create a dictionary to store the encoder used for each feature.
Parameters
----------
column_transformer : ColumnTransformer
The ColumnTransformer
Returns
-------
Dict[str, str]
The feature encoder map
"""
"""Create a dictionary to store the encoder used for each feature."""
feature_encoder_map = {}
transformed_features = column_transformer.get_feature_names_out()

for transformer_name, transformer, features in column_transformer.transformers_:

encoder_name = (
"PassThrough" if transformer_name == "numeric" else transformer.__class__.__name__
)
Expand All @@ -528,20 +399,7 @@ def _log_feature_info(
high_cardinality_features: List[str],
feature_encoder_map: Dict[str, str],
) -> None:
"""Log information about the features.
Parameters
----------
categorical_features : List[str]
Categorical features
numeric_features : List[str]
Numeric features
low_cardinality_features : List[str]
Low cardinality features
high_cardinality_features : List[str]
High cardinality features
feature_encoder_map : Dict[str, str]
The feature encoder map
"""
"""Log information about the features."""
self.logger.info(
f"Categorical features: ({len(categorical_features)}) - {categorical_features}"
)
Expand Down
9 changes: 4 additions & 5 deletions tests/data/test_classification.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,10 @@
"step_type": "EncodeStep",
"parameters": {
"feature_encoders": {
"shared_origin_destination": {
"features": [
"Origin",
"Destination"
],
"Origin": {
"encoder": "OrdinalEncoder"
},
"Destination": {
"encoder": "OrdinalEncoder"
}
}
Expand Down

0 comments on commit b5f97b8

Please sign in to comment.