
Adds support to specify categorical features in lgbm learner #197

Open

wants to merge 8 commits into master
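In short: this PR threads a new categorical_features argument through lgbm_classification_learner and forwards it to lgbm.Dataset. A minimal sketch of the intended usage, based on the diff below (the toy data mirrors the new tests):

import pandas as pd
from fklearn.training.classification import lgbm_classification_learner

train_df = pd.DataFrame({
    "x1": [1, 1, 1, 1],
    "x2": [3, 4, 4, 3],  # integer-encoded categorical column
    "y": [0, 1, 1, 0],
})

# "x2" is declared categorical instead of relying on the default "auto" detection
learner = lgbm_classification_learner(features=["x1", "x2"],
                                      target="y",
                                      categorical_features=["x2"])
predict_fn, scored_train, log = learner(train_df)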
src/fklearn/training/classification.py (12 changes: 9 additions & 3 deletions)
@@ -1,4 +1,4 @@
-from typing import List, Any
+from typing import List, Any, Union

import numpy as np
import pandas as pd
@@ -502,6 +502,7 @@ def lgbm_classification_learner(df: pd.DataFrame,
learning_rate: float = 0.1,
num_estimators: int = 100,
extra_params: LogType = None,
+categorical_features: Union[List[str], str] = "auto",
prediction_column: str = "prediction",
weight_column: str = None,
encode_extra_cols: bool = True) -> LearnerReturnType:
@@ -549,6 +550,11 @@ def lgbm_classification_learner(df: pd.DataFrame,
https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
If not passed, the default will be used.

+categorical_features : list of str, or 'auto', optional (default="auto")
+    A list of column names that should be treated as categorical features.
+    See the categorical_feature hyper-parameter in:
+    https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst

prediction_column : str
The name of the column with the predictions from the model.

@@ -569,8 +575,8 @@

features = features if not encode_extra_cols else expand_features_encoded(df, features)

-dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)), weight=weights,
-                      silent=True)
+dtrain = lgbm.Dataset(df[features], label=df[target], feature_name=list(map(str, features)), weight=weights,
+                      silent=True, categorical_feature=categorical_features)

bst = lgbm.train(params, dtrain, num_estimators)

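Two things change in the Dataset construction above: the new argument is forwarded as categorical_feature, and the DataFrame is passed directly instead of df[features].values, so LightGBM sees column names and pandas dtypes. A sketch of the resulting call, assuming a df whose x2 column is integer-encoded:

import lightgbm as lgbm

# With a DataFrame, categorical_feature can be a list of column names,
# and the default "auto" can pick up pandas 'category' dtype columns;
# a bare numpy array (.values) carries neither names nor dtypes.
dtrain = lgbm.Dataset(df[["x1", "x2"]],
                      label=df["y"],
                      feature_name=["x1", "x2"],
                      categorical_feature=["x2"])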
tests/training/test_classification.py (48 changes: 48 additions & 0 deletions)
@@ -424,6 +424,20 @@ def test_lgbm_classification_learner():
'y': [1, 2, 0, 1, 2, 0]
})

+df_train_categorical = pd.DataFrame({
+    "id": ["id1", "id2", "id3", "id4"],
+    "x1": [1, 1, 1, 1],
+    "x2": [3, 4, 4, 3],
+    "y": [0, 1, 1, 0],
+})
+
+df_test_categorical = pd.DataFrame({
+    "id": ["id5", "id6", "id7", "id8"],
+    "x1": [0, 0, 0, 0],
+    "x2": [3, 3, 4, 4],
+    "y": [0, 0, 1, 1],
+})

features = ["x1", "x2"]

learner_binary = lgbm_classification_learner(features=features,
@@ -446,6 +460,7 @@ def test_lgbm_classification_learner():
assert pred_test_binary.prediction.max() < 1
assert pred_test_binary.prediction.min() > 0
assert (pred_test_binary.columns == pred_train_binary.columns).all()
+assert all(tree['num_cat'] == 0 for tree in log['object'].dump_model()['tree_info'])

# SHAP test
pred_shap = predict_fn_binary(df_test_binary, apply_shap=True)
@@ -474,6 +489,7 @@ def test_lgbm_classification_learner():
assert Counter(expected_col_train) == Counter(pred_train_multinomial.columns.tolist())
assert Counter(expected_col_test) == Counter(pred_test_multinomial.columns.tolist())
assert (pred_test_multinomial.columns == pred_train_multinomial.columns).all()
+assert all(tree['num_cat'] == 0 for tree in log['object'].dump_model()['tree_info'])

# SHAP test multinomial
pred_shap_multinomial = predict_fn_multinomial(df_test_multinomial, apply_shap=True)
@@ -482,3 +498,35 @@
["shap_expected_value_0", "shap_expected_value_1", "shap_expected_value_2"]
assert Counter(expected_col_shap) == Counter(pred_shap_multinomial.columns.tolist())
assert np.vstack(pred_shap_multinomial["shap_values_0"]).shape == (6, 2)

+learner_binary = lgbm_classification_learner(features=features,
+                                             target="y",
+                                             learning_rate=0.1,
+                                             num_estimators=1,
+                                             categorical_features=["x2"],
+                                             extra_params={"max_depth": 2,
+                                                           "min_data_in_leaf": 1,
+                                                           "min_data_per_group": 1,
+                                                           "seed": 42,
+                                                           "objective": "binary"},
+                                             prediction_column="prediction")
+
+predict_fn_categorical, pred_train_categorical, log = learner_binary(df_train_categorical)
+
+pred_test_categorical = predict_fn_categorical(df_test_categorical)
+
+expected_col_train = df_train_categorical.columns.tolist() + ["prediction"]
+expected_col_test = df_test_categorical.columns.tolist() + ["prediction"]
+
+assert Counter(expected_col_train) == Counter(pred_train_categorical.columns.tolist())
+assert Counter(expected_col_test) == Counter(pred_test_categorical.columns.tolist())
+assert pred_test_categorical.prediction.max() < 1
+assert pred_test_categorical.prediction.min() > 0
+assert (pred_test_categorical.columns == pred_train_categorical.columns).all()
+assert any(tree['num_cat'] > 0 for tree in log['object'].dump_model()['tree_info'])
+
+# SHAP test
+pred_shap = predict_fn_categorical(df_test_categorical, apply_shap=True)
+assert "shap_values" in pred_shap.columns
+assert "shap_expected_value" in pred_shap.columns
+assert np.vstack(pred_shap["shap_values"]).shape == (4, 2)
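The num_cat assertions are what actually verify the feature: Booster.dump_model() returns a dict whose tree_info entries record num_cat, the number of categorical splits in each tree. A sketch of the check, assuming log["object"] holds the trained booster as in the tests above:

booster = log["object"]  # the trained lgbm.Booster stored in the learner's log
tree_info = booster.dump_model()["tree_info"]

# num_cat == 0 in every tree means only numerical splits were used;
# num_cat > 0 in some tree means a categorical split was made.
assert any(tree["num_cat"] > 0 for tree in tree_info)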