Adds support for specifying categorical features in the lgbm learner #197

Open · wants to merge 8 commits into base: master
20 changes: 13 additions & 7 deletions src/fklearn/training/classification.py
@@ -1,4 +1,4 @@
-from typing import List, Any
+from typing import List, Any, Union

import numpy as np
import pandas as pd
@@ -502,6 +502,7 @@ def lgbm_classification_learner(df: pd.DataFrame,
                                learning_rate: float = 0.1,
                                num_estimators: int = 100,
                                extra_params: LogType = None,
+                               categorical_features: Union[List[str], str] = "auto",
                                prediction_column: str = "prediction",
                                weight_column: str = None,
                                encode_extra_cols: bool = True) -> LearnerReturnType:
@@ -549,6 +550,11 @@ def lgbm_classification_learner(df: pd.DataFrame,
        https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
        If not passed, the default will be used.

+   categorical_features : list of str, or 'auto', optional (default="auto")
+       A list of column names that should be treated as categorical features.
+       See the categorical_feature hyper-parameter in:
+       https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
+
    prediction_column : str
        The name of the column with the predictions from the model.

@@ -565,21 +571,21 @@ def lgbm_classification_learner(df: pd.DataFrame,
    params = assoc(params, "eta", learning_rate)
    params = params if "objective" in params else assoc(params, "objective", 'binary')

-   weights = df[weight_column].values if weight_column else None
+   weights = df[weight_column] if weight_column else None

    features = features if not encode_extra_cols else expand_features_encoded(df, features)

-   dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)), weight=weights,
-                         silent=True)
+   dtrain = lgbm.Dataset(df[features], label=df[target], feature_name=list(map(str, features)), weight=weights,
+                         silent=True, categorical_feature=categorical_features)

-   bst = lgbm.train(params, dtrain, num_estimators)
+   bst = lgbm.train(params, dtrain, num_estimators, categorical_feature=categorical_features)

    def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
        if params["objective"] == "multiclass":
            col_dict = {prediction_column + "_" + str(key): value
-                       for (key, value) in enumerate(bst.predict(new_df[features].values).T)}
+                       for (key, value) in enumerate(bst.predict(new_df[features]).T)}
        else:
-           col_dict = {prediction_column: bst.predict(new_df[features].values)}
+           col_dict = {prediction_column: bst.predict(new_df[features])}

        if apply_shap:
            import shap
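For reviewers who want to try the change, here is a minimal sketch of how the new parameter would be used. The toy frame and column names are made up for illustration; the call signature is the one this diff introduces:

```python
import pandas as pd

from fklearn.training.classification import lgbm_classification_learner

# Toy data: "x2" is a low-cardinality column we want LightGBM to treat
# as categorical rather than ordinal.
df = pd.DataFrame({
    "x1": [1.0, 2.0, 1.5, 3.0],
    "x2": [3, 4, 4, 3],
    "y": [0, 1, 1, 0],
})

learner = lgbm_classification_learner(features=["x1", "x2"],
                                      target="y",
                                      num_estimators=10,
                                      extra_params={"min_data_in_leaf": 1},
                                      categorical_features=["x2"])

# fklearn learners return (predict_fn, scored_train_df, log).
predict_fn, scored_train, log = learner(df)
```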
16 changes: 11 additions & 5 deletions src/fklearn/training/regression.py
@@ -412,6 +412,7 @@ def lgbm_regression_learner(df: pd.DataFrame,
                            learning_rate: float = 0.1,
                            num_estimators: int = 100,
                            extra_params: Dict[str, Any] = None,
+                           categorical_features: Union[List[str], str] = "auto",
                            prediction_column: str = "prediction",
                            weight_column: str = None,
                            encode_extra_cols: bool = True) -> LearnerReturnType:
@@ -458,6 +459,11 @@ def lgbm_regression_learner(df: pd.DataFrame,
        https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
        If not passed, the default will be used.

+   categorical_features : list of str, or 'auto', optional (default="auto")
+       A list of column names that should be treated as categorical features.
+       See the categorical_feature hyper-parameter in:
+       https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
+
    prediction_column : str
        The name of the column with the predictions from the model.

@@ -474,17 +480,17 @@ def lgbm_regression_learner(df: pd.DataFrame,
    params = assoc(params, "eta", learning_rate)
    params = params if "objective" in params else assoc(params, "objective", 'regression')

-   weights = df[weight_column].values if weight_column else None
+   weights = df[weight_column] if weight_column else None

    features = features if not encode_extra_cols else expand_features_encoded(df, features)

-   dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)), weight=weights,
-                         silent=True)
+   dtrain = lgbm.Dataset(df[features], label=df[target], feature_name=list(map(str, features)), weight=weights,
+                         silent=True, categorical_feature=categorical_features)

-   bst = lgbm.train(params, dtrain, num_estimators)
+   bst = lgbm.train(params, dtrain, num_estimators, categorical_feature=categorical_features)

    def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
-       col_dict = {prediction_column: bst.predict(new_df[features].values)}
+       col_dict = {prediction_column: bst.predict(new_df[features])}

        if apply_shap:
            import shap
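A side effect worth calling out in review: both learners now feed `df[features]` (a DataFrame) rather than `df[features].values` into `lgbm.Dataset` and `Booster.predict`. A DataFrame keeps column names and dtypes, which is what name-based `categorical_feature` matching relies on. A minimal sketch at the raw LightGBM level, with toy data, of the behavior the diff leans on:

```python
import lightgbm as lgbm
import pandas as pd

df = pd.DataFrame({
    "x1": [1.0, 2.0, 1.5, 3.0],
    "x2": [3, 4, 4, 3],
    "y": [0, 1, 1, 0],
})

# With a DataFrame, "x2" can be matched by name. A bare numpy array
# (df[["x1", "x2"]].values) loses the column names, so LightGBM would
# fall back to Column_0-style names unless feature_name is passed too.
dtrain = lgbm.Dataset(df[["x1", "x2"]], label=df["y"], categorical_feature=["x2"])
bst = lgbm.train({"objective": "binary", "min_data_in_leaf": 1}, dtrain, num_boost_round=5)
```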
48 changes: 48 additions & 0 deletions tests/training/test_classification.py
@@ -424,6 +424,20 @@ def test_lgbm_classification_learner():
        'y': [1, 2, 0, 1, 2, 0]
    })

+   df_train_categorical = pd.DataFrame({
+       "id": ["id1", "id2", "id3", "id4"],
+       "x1": [1, 1, 1, 1],
+       "x2": [3, 4, 4, 3],
+       "y": [0, 1, 1, 0],
+   })
+
+   df_test_categorical = pd.DataFrame({
+       "id": ["id5", "id6", "id7", "id8"],
+       "x1": [0, 0, 0, 0],
+       "x2": [3, 3, 4, 4],
+       "y": [0, 0, 1, 1],
+   })
+
    features = ["x1", "x2"]

    learner_binary = lgbm_classification_learner(features=features,
@@ -446,6 +460,7 @@ def test_lgbm_classification_learner():
    assert pred_test_binary.prediction.max() < 1
    assert pred_test_binary.prediction.min() > 0
    assert (pred_test_binary.columns == pred_train_binary.columns).all()
+   assert all(tree['num_cat'] == 0 for tree in log['object'].dump_model()['tree_info'])

    # SHAP test
    pred_shap = predict_fn_binary(df_test_binary, apply_shap=True)
@@ -474,6 +489,7 @@ def test_lgbm_classification_learner():
    assert Counter(expected_col_train) == Counter(pred_train_multinomial.columns.tolist())
    assert Counter(expected_col_test) == Counter(pred_test_multinomial.columns.tolist())
    assert (pred_test_multinomial.columns == pred_train_multinomial.columns).all()
+   assert all(tree['num_cat'] == 0 for tree in log['object'].dump_model()['tree_info'])

    # SHAP test multinomial
    pred_shap_multinomial = predict_fn_multinomial(df_test_multinomial, apply_shap=True)
@@ -482,3 +498,35 @@ def test_lgbm_classification_learner():
        ["shap_expected_value_0", "shap_expected_value_1", "shap_expected_value_2"]
    assert Counter(expected_col_shap) == Counter(pred_shap_multinomial.columns.tolist())
    assert np.vstack(pred_shap_multinomial["shap_values_0"]).shape == (6, 2)

+   learner_binary = lgbm_classification_learner(features=features,
+                                                target="y",
+                                                learning_rate=0.1,
+                                                num_estimators=1,
+                                                categorical_features=["x2"],
+                                                extra_params={"max_depth": 2,
+                                                              "min_data_in_leaf": 1,
+                                                              "min_data_per_group": 1,
+                                                              "seed": 42,
+                                                              "objective": "binary"},
+                                                prediction_column="prediction")
+
+   predict_fn_categorical, pred_train_categorical, log = learner_binary(df_train_categorical)
+
+   pred_test_categorical = predict_fn_categorical(df_test_categorical)
+
+   expected_col_train = df_train_categorical.columns.tolist() + ["prediction"]
+   expected_col_test = df_test_categorical.columns.tolist() + ["prediction"]
+
+   assert Counter(expected_col_train) == Counter(pred_train_categorical.columns.tolist())
+   assert Counter(expected_col_test) == Counter(pred_test_categorical.columns.tolist())
+   assert pred_test_categorical.prediction.max() < 1
+   assert pred_test_categorical.prediction.min() > 0
+   assert (pred_test_categorical.columns == pred_train_categorical.columns).all()
+   assert any(tree['num_cat'] > 0 for tree in log['object'].dump_model()['tree_info'])
+
+   # SHAP test
+   pred_shap = predict_fn_categorical(df_test_categorical, apply_shap=True)
+   assert "shap_values" in pred_shap.columns
+   assert "shap_expected_value" in pred_shap.columns
+   assert np.vstack(pred_shap["shap_values"]).shape == (4, 2)
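The new assertions rely on `Booster.dump_model()`, which reports a `num_cat` count per tree, so any categorical split anywhere in the ensemble makes it non-zero. A small sketch of that check pulled out on its own (`booster` is assumed to be a trained `lgbm.Booster`, e.g. the `log['object']` used above):

```python
def uses_categorical_splits(booster) -> bool:
    """Return True if any tree in the dumped model contains a categorical split."""
    return any(tree["num_cat"] > 0 for tree in booster.dump_model()["tree_info"])
```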
30 changes: 30 additions & 0 deletions tests/training/test_regression.py
@@ -169,6 +169,7 @@ def test_lgbm_regression_learner():
    assert Counter(expected_col_train) == Counter(pred_train.columns.tolist())
    assert Counter(expected_col_test) == Counter(pred_test.columns.tolist())
    assert (pred_test.columns == pred_train.columns).all()
+   assert all(tree['num_cat'] == 0 for tree in log['object'].dump_model()['tree_info'])
    assert "prediction" in pred_test.columns

    # SHAP test
@@ -177,6 +178,35 @@ def test_lgbm_regression_learner():
    assert "shap_expected_value" in pred_shap.columns
    assert np.vstack(pred_shap["shap_values"]).shape == (4, 2)

+   learner = lgbm_regression_learner(features=features,
+                                     target="y",
+                                     learning_rate=0.1,
+                                     num_estimators=1,
+                                     categorical_features=["x2"],
+                                     extra_params={"max_depth": 2,
+                                                   "min_data_in_leaf": 1,
+                                                   "min_data_per_group": 1,
+                                                   "seed": 42},
+                                     prediction_column="prediction")
+
+   predict_fn, pred_train, log = learner(df_train)
+
+   pred_test = predict_fn(df_test)
+
+   expected_col_train = df_train.columns.tolist() + ["prediction"]
+   expected_col_test = df_test.columns.tolist() + ["prediction"]
+
+   assert Counter(expected_col_train) == Counter(pred_train.columns.tolist())
+   assert Counter(expected_col_test) == Counter(pred_test.columns.tolist())
+   assert (pred_test.columns == pred_train.columns).all()
+   assert any(tree['num_cat'] > 0 for tree in log['object'].dump_model()['tree_info'])
+
+   # SHAP test
+   pred_shap = predict_fn(df_test, apply_shap=True)
+   assert "shap_values" in pred_shap.columns
+   assert "shap_expected_value" in pred_shap.columns
+   assert np.vstack(pred_shap["shap_values"]).shape == (4, 2)


def test_catboost_regressor_learner():
    df_train = pd.DataFrame({