Enforce codestyle using pre-commit and some related fixes #191

Open · wants to merge 6 commits into base: master
Changes from 3 commits
33 changes: 33 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,33 @@
default_stages: [commit, push]
repos:
- repo: https://github.com/pycqa/isort
rev: 5.10.1
hooks:
- id: isort
name: isort (python)
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.0.1
hooks:
- id: check-added-large-files
- id: debug-statements
- id: double-quote-string-fixer
- id: end-of-file-fixer
- id: mixed-line-ending
- id: check-yaml
- id: requirements-txt-fixer
- id: mixed-line-ending
- id: trailing-whitespace
- repo: https://github.com/pre-commit/mirrors-autopep8
rev: "v1.6.0"
hooks:
- id: autopep8
- repo: https://github.com/pycqa/flake8
rev: 3.9.2
hooks:
- id: flake8
name: flake8 (python)
- repo: https://github.com/myint/docformatter
rev: v1.4
hooks:
- id: docformatter
args: ["--wrap-descriptions", "0"]
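Note for reviewers: with the configuration above in place, the hooks would typically be enabled in a local clone roughly as follows. This is a minimal sketch for context, not part of the diff, and assumes pre-commit is installed from PyPI:

    $ pip install pre-commit
    $ pre-commit install          # register the git hook locally
    $ pre-commit run --all-files  # apply every configured hook to the existing files once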
5 changes: 2 additions & 3 deletions docs/source/conf.py
@@ -24,8 +24,7 @@
# -- Project information -----------------------------------------------------

project = 'fklearn'
current_year = str(datetime.now().year)
copyright = current_year + ', Nubank Data Science Team'
copyright = f'{datetime.now():%Y}, Nubank Data Science Team'
author = 'Nubank Data Science Team'

# The short X.Y version
@@ -73,7 +72,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path .
exclude_patterns = ["examples/.ipynb_checkpoints/*"]
exclude_patterns = ['examples/.ipynb_checkpoints/*']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
9 changes: 4 additions & 5 deletions docs/source/examples/feature_transformation.rst
@@ -39,22 +39,22 @@ Below we present an example of encoder usage, applying a ``target_categorizer``
df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=['a', 'b', 'c'])

# Replace
pipe = target_categorizer(columns_to_categorize=['b', 'a'],
pipe = target_categorizer(columns_to_categorize=['b', 'a'],
target_column='c')
p, p_df0, log = pipe(df)

# Store originals in columns listed in a dict
pipe = target_categorizer(columns_to_categorize=['b', 'a'],
pipe = target_categorizer(columns_to_categorize=['b', 'a'],
target_column='c', columns_mapping={'b': 'b_raw'})
p, p_df1, log = pipe(df)

# Add prefix to the columns with original values
pipe = target_categorizer(columns_to_categorize=['b', 'a'],
pipe = target_categorizer(columns_to_categorize=['b', 'a'],
target_column='c', prefix='raw__')
p, p_df2, log = pipe(df)

# Add suffix to the columns with original values
pipe = target_categorizer(columns_to_categorize=['b', 'a'],
pipe = target_categorizer(columns_to_categorize=['b', 'a'],
target_column='c', suffix='__raw')
p, p_df3, log = pipe(df)

@@ -63,4 +63,3 @@ Below we present an example of encoder usage, applying a ``target_categorizer``
print(p_df1)
print(p_df2)
print(p_df3)

4 changes: 2 additions & 2 deletions docs/source/getting_started.rst
@@ -14,10 +14,10 @@ You can also install from the source::

# clone the repository
$ git clone -b master https://github.com/nubank/fklearn.git --depth=1

# open the folder
$ cd fklearn

# install the dependencies
$ pip install -e .

2 changes: 1 addition & 1 deletion requirements_demos.txt
@@ -1,3 +1,3 @@
matplotlib>=3.0.2,<4
tqdm>=4.32.1,<5
scipy>=1.2.1,<2
tqdm>=4.32.1,<5
6 changes: 3 additions & 3 deletions requirements_test.txt
@@ -1,6 +1,6 @@
codecov>=2.0,<3
hypothesis>=5.5.4,<7
mypy>=0.670,<1
pytest>=4.2.1,<7
pytest-cov>=2.6.1,<3
pytest-xdist>=1.26.1,<3
mypy>=0.670,<1
codecov>=2.0,<3
hypothesis>=5.5.4,<7
46 changes: 25 additions & 21 deletions setup.py
@@ -1,49 +1,53 @@
#!/usr/bin/env python
from os.path import join

from setuptools import setup, find_packages
from setuptools import find_packages, setup

MODULE_NAME = 'fklearn' # package name used to install via pip (as shown in `pip freeze` or `conda list`)
MODULE_NAME_IMPORT = 'fklearn' # this is how this module is imported in Python (name of the folder inside `src`)
REPO_NAME = 'fklearn' # repository name


def requirements_from_pip(filename='requirements.txt'):
with open(filename, 'r') as pip:
return [l.strip() for l in pip if not l.startswith('#') and l.strip()]
with open(filename, 'r') as requirements_file:
return [
req.strip()
for req in requirements_file
if not req.startswith('#') and req.strip()
]


core_deps = requirements_from_pip()
demos_deps = requirements_from_pip("requirements_demos.txt")
test_deps = requirements_from_pip("requirements_test.txt")
demos_deps = requirements_from_pip('requirements_demos.txt')
test_deps = requirements_from_pip('requirements_test.txt')

tools_deps = requirements_from_pip("requirements_tools.txt")
tools_deps = requirements_from_pip('requirements_tools.txt')

lgbm_deps = requirements_from_pip("requirements_lgbm.txt")
xgboost_deps = requirements_from_pip("requirements_xgboost.txt")
catboost_deps = requirements_from_pip("requirements_catboost.txt")
lgbm_deps = requirements_from_pip('requirements_lgbm.txt')
xgboost_deps = requirements_from_pip('requirements_xgboost.txt')
catboost_deps = requirements_from_pip('requirements_catboost.txt')

all_models_deps = lgbm_deps + xgboost_deps + catboost_deps
all_deps = all_models_deps + tools_deps
devel_deps = test_deps + all_deps

setup(name=MODULE_NAME,
description="Functional machine learning",
url='https://github.com/nubank/{:s}'.format(REPO_NAME),
author="Nubank",
description='Functional machine learning',
url=f'https://github.com/nubank/{REPO_NAME}',
author='Nubank',
package_dir={'': 'src'},
packages=find_packages('src'),
version=(open(join('src', MODULE_NAME, 'resources', 'VERSION'))
.read().strip()),
install_requires=core_deps,
extras_require={"test_deps": test_deps,
"lgbm": lgbm_deps,
"xgboost": xgboost_deps,
"catboost": catboost_deps,
"tools": tools_deps,
"devel": devel_deps,
"all_models": all_models_deps,
"all": all_deps},
extras_require={'test_deps': test_deps,
'lgbm': lgbm_deps,
'xgboost': xgboost_deps,
'catboost': catboost_deps,
'tools': tools_deps,
'devel': devel_deps,
'all_models': all_models_deps,
'all': all_deps},
include_package_data=True,
zip_safe=False,
classifiers=['Programming Language :: Python :: 3.6'])

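For context (an illustration, not part of this diff): the extras_require keys re-quoted above map to the usual pip extras syntax, for example:

    $ pip install fklearn                # core dependencies only
    $ pip install "fklearn[all_models]"  # adds the LightGBM, XGBoost and CatBoost extras
    $ pip install -e ".[devel]"          # local clone with test, tooling and model extras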
2 changes: 1 addition & 1 deletion src/fklearn/__init__.py
@@ -1 +1 @@
from .version import __version__
from .version import __version__ # noqa: F401
15 changes: 8 additions & 7 deletions src/fklearn/causal/validation/auc.py
@@ -1,9 +1,9 @@
import pandas as pd
from toolz import curry

from fklearn.types import EffectFnType
from fklearn.causal.validation.curves import cumulative_effect_curve
from fklearn.causal.effects import linear_effect
from fklearn.causal.validation.curves import cumulative_effect_curve
from fklearn.types import EffectFnType


@curry
@@ -57,7 +57,8 @@ def area_under_the_cumulative_effect_curve(df: pd.DataFrame,
cum_effect = cumulative_effect_curve(df=df, treatment=treatment, outcome=outcome, prediction=prediction,
min_rows=min_rows, steps=steps, effect_fn=effect_fn)

return abs(sum([(effect - ate) * (step_size / size) for effect, step_size in zip(cum_effect, step_sizes)]))
return abs(sum((effect - ate) * (step_size / size)
for effect, step_size in zip(cum_effect, step_sizes)))


@curry
@@ -109,8 +110,8 @@ def area_under_the_cumulative_gain_curve(df: pd.DataFrame,
cum_effect = cumulative_effect_curve(df=df, treatment=treatment, outcome=outcome, prediction=prediction,
min_rows=min_rows, steps=steps, effect_fn=effect_fn)

return abs(sum([effect * (rows / size) * (step_size / size)
for rows, effect, step_size in zip(n_rows, cum_effect, step_sizes)]))
return abs(sum(effect * (rows / size) * (step_size / size)
for rows, effect, step_size in zip(n_rows, cum_effect, step_sizes)))


@curry
@@ -164,5 +165,5 @@ def area_under_the_relative_cumulative_gain_curve(df: pd.DataFrame,
cum_effect = cumulative_effect_curve(df=df, treatment=treatment, outcome=outcome, prediction=prediction,
min_rows=min_rows, steps=steps, effect_fn=effect_fn)

return abs(sum([(effect - ate) * (rows / size) * (step_size / size)
for rows, effect, step_size in zip(n_rows, cum_effect, step_sizes)]))
return abs(sum((effect - ate) * (rows / size) * (step_size / size)
for rows, effect, step_size in zip(n_rows, cum_effect, step_sizes)))
33 changes: 10 additions & 23 deletions src/fklearn/causal/validation/cate.py
@@ -31,18 +31,12 @@ def _validate_test_and_control_groups(test_data: pd.DataFrame,
unique_values = test_data[group_column].unique()

if control_group_name not in unique_values:
raise ValueError("control group '{}' not found".format(control_group_name))
raise ValueError(f"control group '{control_group_name}' not found")

n_groups = len(unique_values)
if n_groups != 2:
raise RuntimeError(
"Exactly 2 groups are required for delta evaluations. found {}".format(
n_groups
)
)
return (
unique_values[0] if control_group_name == unique_values[1] else unique_values[1]
)
raise RuntimeError(f'Exactly 2 groups are required for delta evaluations. found {n_groups}')
return unique_values[0] if control_group_name == unique_values[1] else unique_values[1]


def cate_mean_by_bin(test_data: pd.DataFrame,
@@ -95,8 +89,8 @@ def cate_mean_by_bin(test_data: pd.DataFrame,

test_after_control = test_group_name > control_group_name

quantile_column = bin_column + "_q" + str(n_bins)
duplicates = "drop" if allow_dropped_bins else "raise"
quantile_column = f'{bin_column}_q{n_bins}'
duplicates = 'drop' if allow_dropped_bins else 'raise'
test_data_binned = test_data.assign(
**{
quantile_column: pd.qcut(
@@ -125,8 +119,8 @@ def cate_mean_by_bin_meta_evaluator(test_data: pd.DataFrame,
allow_dropped_bins: bool = False,
inner_evaluator: UncurriedEvalFnType = r2_evaluator,
eval_name: str = None,
prediction_column: str = "prediction",
target_column: str = "target") -> EvalReturnType:
prediction_column: str = 'prediction',
target_column: str = 'target') -> EvalReturnType:
"""
Evaluates the predictions of a causal model that outputs treatment outcomes w.r.t. its capabilities to predict the
CATE.
@@ -184,19 +178,12 @@ def cate_mean_by_bin_meta_evaluator(test_data: pd.DataFrame,
)
except ValueError:
raise ValueError(
"can't create {} bins for column '{}'. use 'allow_dropped_bins=True' to drop duplicated bins".format(
n_bins, bin_column
)
f"can't create {n_bins} bins for column '{bin_column}'."
" Use 'allow_dropped_bins=True' to drop duplicated bins"
)

if eval_name is None:
eval_name = (
"cate_mean_by_bin_"
+ bin_column
+ "[{}q]".format(n_bins)
+ "__"
+ inner_evaluator.__name__
)
eval_name = f'cate_mean_by_bin_{bin_column}[{n_bins}q]__{inner_evaluator.__name__}'

return inner_evaluator(
test_data=gb,
18 changes: 9 additions & 9 deletions src/fklearn/metrics/pd_extractors.py
@@ -3,8 +3,8 @@
from itertools import chain, repeat

import pandas as pd
from toolz import curry
from numpy import nan
from toolz import curry


@curry
@@ -21,9 +21,9 @@ def combined_evaluator_extractor(result, base_extractors):
@curry
def split_evaluator_extractor_iteration(split_value, result, split_col, base_extractor, eval_name=None):
if eval_name is None:
eval_name = 'split_evaluator__' + split_col
eval_name = f'split_evaluator__{split_col}'

key = eval_name + '_' + str(split_value)
key = f'{eval_name}_{str(split_value)}'

return (base_extractor(result.get(key, {}))
.assign(**{eval_name: split_value}))
@@ -38,9 +38,9 @@ def split_evaluator_extractor(result, split_col, split_values, base_extractor, e


@curry
def temporal_split_evaluator_extractor(result, time_col, base_extractor, time_format="%Y-%m", eval_name=None):
def temporal_split_evaluator_extractor(result, time_col, base_extractor, time_format='%Y-%m', eval_name=None):
if eval_name is None:
eval_name = 'split_evaluator__' + time_col
eval_name = f'split_evaluator__{time_col}'

split_keys = [key for key in result.keys() if eval_name in key]
split_values = []
@@ -117,8 +117,8 @@ def extract_sc(validator_results, extractor):

@curry
def extract_param_tuning_iteration(iteration, tuning_log, base_extractor, model_learner_name):
iter_df = base_extractor(tuning_log[iteration]["validator_log"])
return iter_df.assign(**tuning_log[iteration]["train_log"][model_learner_name]["parameters"])
iter_df = base_extractor(tuning_log[iteration]['validator_log'])
return iter_df.assign(**tuning_log[iteration]['train_log'][model_learner_name]['parameters'])


@curry
@@ -134,8 +134,8 @@ def permutation_extractor(results, base_extractor):
df.index = results['permutation_importance'].keys()
if 'permutation_importance_baseline' in results: # With baseline comparison
baseline = base_extractor(results['permutation_importance_baseline'])
baseline.index = ["baseline"]
baseline.index = ['baseline']
df = pd.concat((df, baseline))
for c in baseline.columns:
df[c + '_delta_from_baseline'] = baseline[c].iloc[0] - df[c]
df[f'{c}_delta_from_baseline'] = baseline[c].iloc[0] - df[c]
return df
8 changes: 4 additions & 4 deletions src/fklearn/preprocessing/rebalancing.py
@@ -29,7 +29,7 @@ def rebalance_by_categorical(dataset: pd.DataFrame, categ_column: str, max_lines
"""

categs = dataset[categ_column].value_counts().to_dict()
max_lines_by_categ = max_lines_by_categ if max_lines_by_categ else min(categs.values())
max_lines_by_categ = max_lines_by_categ or min(categs.values())

return pd.concat([(dataset
.loc[dataset[categ_column] == categ, :]
@@ -69,11 +69,11 @@ def rebalance_by_continuous(dataset: pd.DataFrame, continuous_column: str, bucke
A dataset with fewer lines than dataset, but with the same number of lines per category in categ_column
"""

bin_fn = partial(pd.qcut, q=buckets, duplicates="drop") if by_quantile else partial(pd.cut, bins=buckets)
bin_fn = partial(pd.qcut, q=buckets, duplicates='drop') if by_quantile else partial(pd.cut, bins=buckets)

return (dataset
.assign(bins=bin_fn(dataset[continuous_column]))
.pipe(rebalance_by_categorical(categ_column="bins",
.pipe(rebalance_by_categorical(categ_column='bins',
max_lines_by_categ=max_lines_by_categ,
seed=seed))
.drop(columns=["bins"]))
.drop(columns=['bins']))