Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] variable arguments support in select #1288

Merged
merged 17 commits into from
Oct 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,4 @@ Contributors
- [@asmirnov69](https://github.com/asmirnov69) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3Aasmirnov69)
- [@xujiboy](https://github.com/xujiboy) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3Axujiboy)
- [@joranbeasley](https://github.com/joranbeasley) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3Ajoranbeasley)
- [@kianmeng](https://github.com/kianmeng) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1290#issue-1906020324)
samukweku marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Changelog

## [Unreleased]

- [ENH] `select` function now supports variable arguments - PR #1288 @samukweku

## [v0.26.0] - 2023-09-18

- [ENH] `clean_names` can now be applied to column values. Issue #995 @samukweku
Expand Down
35 changes: 9 additions & 26 deletions janitor/functions/currency_column_to_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,16 @@ def currency_column_to_numeric(

column_series = df[column_name]
if cleaning_style == "accounting":
df.loc[:, column_name] = df[column_name].apply(
_clean_accounting_column
samukweku marked this conversation as resolved.
Show resolved Hide resolved
outcome = (
df[column_name]
.str.strip()
.str.replace(",", "", regex=False)
.str.replace(")", "", regex=False)
.str.replace("(", "-", regex=False)
.replace({"-": 0.0})
.astype(float)
)
return df
return df.assign(**{column_name: outcome})
if cleaning_style is not None:
raise ValueError(
"`cleaning_style` is expected to be one of ('accounting', None). "
Expand Down Expand Up @@ -130,29 +136,6 @@ def currency_column_to_numeric(
return df


def _clean_accounting_column(x: str) -> float:
"""Perform the logic for the "accounting" cleaning style.

This is a private function, not intended to be used outside of
`currency_column_to_numeric``.

It is intended to be used in a pandas `apply` method.

Args:
x: A string representing currency.

Returns:
A float representing currency.
"""
y = x.strip()
y = y.replace(",", "")
y = y.replace(")", "")
y = y.replace("(", "-")
if y == "-":
return 0.00
return float(y)


def _currency_column_to_numeric(
x: str,
cast_non_numeric: Optional[dict] = None,
Expand Down
19 changes: 8 additions & 11 deletions janitor/functions/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,31 +260,28 @@ def _date_filter_conditions(conditions):
"""Taken from: https://stackoverflow.com/a/13616382."""
return reduce(np.logical_and, conditions)

if column_date_options:
df.loc[:, column_name] = pd.to_datetime(
df.loc[:, column_name], **column_date_options
)
else:
df.loc[:, column_name] = pd.to_datetime(df.loc[:, column_name])
if column_date_options is None:
column_date_options = {}
df[column_name] = pd.to_datetime(df[column_name], **column_date_options)

_filter_list = []

if start_date:
start_date = pd.to_datetime(start_date, format=format)
_filter_list.append(df.loc[:, column_name] >= start_date)
_filter_list.append(df[column_name] >= start_date)

if end_date:
end_date = pd.to_datetime(end_date, format=format)
_filter_list.append(df.loc[:, column_name] <= end_date)
_filter_list.append(df[column_name] <= end_date)

if years:
_filter_list.append(df.loc[:, column_name].dt.year.isin(years))
_filter_list.append(df[column_name].dt.year.isin(years))

if months:
_filter_list.append(df.loc[:, column_name].dt.month.isin(months))
_filter_list.append(df[column_name].dt.month.isin(months))

if days:
_filter_list.append(df.loc[:, column_name].dt.day.isin(days))
_filter_list.append(df[column_name].dt.day.isin(days))

if start_date and end_date and start_date > end_date:
warnings.warn(
Expand Down
104 changes: 86 additions & 18 deletions janitor/functions/select.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
from typing import Any
import pandas_flavor as pf
import pandas as pd
from janitor.utils import deprecated_alias
from janitor.utils import refactored_function
from janitor.utils import check, deprecated_alias
from janitor.functions.utils import _select, DropLabel # noqa: F401


@pf.register_dataframe_method
@deprecated_alias(search_cols="search_column_names")
@refactored_function(
message=(
"This function will be deprecated in a 1.x release. "
"Please use `jn.select` instead."
)
)
def select_columns(
df: pd.DataFrame,
*args: Any,
Expand All @@ -30,6 +36,11 @@ def select_columns(
is with `.loc` or `.iloc` methods.
`select_columns` is primarily for convenience.

!!!note

This function will be deprecated in a 1.x release.
Please use `jn.select` instead.

Examples:
>>> import pandas as pd
>>> import janitor
Expand Down Expand Up @@ -209,19 +220,26 @@ class mammal
a callable,
or variable arguments of all the aforementioned.
A sequence of booleans is also acceptable.
A dictionary can be used for selection on a MultiIndex on different levels.
A dictionary can be used for selection
on a MultiIndex on different levels.
invert: Whether or not to invert the selection.
This will result in the selection of the complement of the columns
provided.
This will result in the selection
of the complement of the columns provided.

Returns:
A pandas DataFrame with the specified columns selected.
""" # noqa: E501

return _select(df, args=args, invert=invert, axis="columns")
return _select(df, columns=list(args), invert=invert)


@pf.register_dataframe_method
@refactored_function(
message=(
"This function will be deprecated in a 1.x release. "
"Please use `jn.select` instead."
)
)
def select_rows(
df: pd.DataFrame,
*args: Any,
Expand All @@ -242,13 +260,17 @@ def select_rows(

!!! info "New in version 0.24.0"


!!!note

The preferred option when selecting columns or rows in a Pandas DataFrame
is with `.loc` or `.iloc` methods, as they are generally performant.
`select_rows` is primarily for convenience.

!!!note

This function will be deprecated in a 1.x release.
Please use `jn.select` instead.

Examples:
>>> import pandas as pd
>>> import janitor
Expand All @@ -275,20 +297,27 @@ def select_rows(
a callable,
or variable arguments of all the aforementioned.
A sequence of booleans is also acceptable.
A dictionary can be used for selection on a MultiIndex on different levels.
A dictionary can be used for selection
on a MultiIndex on different levels.
invert: Whether or not to invert the selection.
This will result in the selection of the complement of the rows
provided.
This will result in the selection
of the complement of the rows provided.

Returns:
A pandas DataFrame with the specified rows selected.
""" # noqa: E501
return _select(df, args=args, invert=invert, axis="index")
return _select(df, rows=list(args), invert=invert)


@pf.register_dataframe_method
@deprecated_alias(rows="index")
def select(
df: pd.DataFrame, *, rows: Any = None, columns: Any = None
df: pd.DataFrame,
*args,
index: Any = None,
columns: Any = None,
axis: str = "columns",
invert: bool = False,
) -> pd.DataFrame:
"""Method-chainable selection of rows and columns.

Expand All @@ -302,6 +331,8 @@ def select(

Selection can be inverted with the `DropLabel` class.

Optional ability to invert selection of index/columns available as well.


!!! info "New in version 0.24.0"

Expand All @@ -312,6 +343,12 @@ def select(
is with `.loc` or `.iloc` methods, as they are generally performant.
`select` is primarily for convenience.

!!! abstract "Version Changed"

- 0.26.0
- Added variable `args`, `invert` and `axis` parameters.
samukweku marked this conversation as resolved.
Show resolved Hide resolved
- `rows` keyword deprecated in favour of `index`.

Examples:
>>> import pandas as pd
>>> import janitor
Expand All @@ -323,13 +360,13 @@ def select(
cobra 1 2
viper 4 5
sidewinder 7 8
>>> df.select(rows='cobra', columns='shield')
>>> df.select(index='cobra', columns='shield')
shield
cobra 2

Labels can be dropped with the `DropLabel` class:

>>> df.select(rows=DropLabel('cobra'))
>>> df.select(index=DropLabel('cobra'))
max_speed shield
viper 4 5
sidewinder 7 8
Expand All @@ -339,23 +376,54 @@ def select(

Args:
df: A pandas DataFrame.
rows: Valid inputs include: an exact label to look for,
*args: Valid inputs include: an exact index name to look for,
a shell-style glob string (e.g. `*_thing_*`),
a regular expression,
a callable,
or variable arguments of all the aforementioned.
A sequence of booleans is also acceptable.
A dictionary can be used for selection
on a MultiIndex on different levels.
index: Valid inputs include: an exact label to look for,
a shell-style glob string (e.g. `*_thing_*`),
a regular expression,
a callable,
or variable arguments of all the aforementioned.
A sequence of booleans is also acceptable.
A dictionary can be used for selection on a MultiIndex on different levels.
A dictionary can be used for selection
on a MultiIndex on different levels.
columns: Valid inputs include: an exact label to look for,
a shell-style glob string (e.g. `*_thing_*`),
a regular expression,
a callable,
or variable arguments of all the aforementioned.
A sequence of booleans is also acceptable.
A dictionary can be used for selection on a MultiIndex on different levels.
A dictionary can be used for selection
on a MultiIndex on different levels.
invert: Whether or not to invert the selection.
This will result in the selection
of the complement of the rows/columns provided.
axis: Whether the selection should be on the index('index'),
or columns('columns').
Applicable only for the variable args parameter.

Raises:
ValueError: If args and index/columns are provided.

Returns:
A pandas DataFrame with the specified rows and/or columns selected.
""" # noqa: E501

return _select(df, args=None, rows=rows, columns=columns, axis="both")
if args:
check("invert", invert, [bool])
if (index is not None) or (columns is not None):
raise ValueError(
"Either provide variable args with the axis parameter, "
"or provide arguments to the index and/or columns parameters."
)
if axis == "index":
return _select(df, rows=list(args), columns=columns, invert=invert)
if axis == "columns":
return _select(df, columns=list(args), rows=index, invert=invert)
raise ValueError("axis should be either 'index' or 'columns'.")
return _select(df, rows=index, columns=columns, invert=invert)
34 changes: 17 additions & 17 deletions janitor/functions/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -642,9 +642,7 @@ def get_columns(group: Union[DataFrameGroupBy, SeriesGroupBy], label):

def _select(
df: pd.DataFrame,
args: tuple,
invert: bool = False,
axis: str = "index",
rows=None,
columns=None,
) -> pd.DataFrame:
Expand All @@ -653,23 +651,25 @@ def _select(

Returns a DataFrame.
"""
assert axis in {"both", "index", "columns"}
if axis == "both":
if rows is None:
rows = slice(None)
if rows is None:
row_indexer = slice(None)
else:
outcome = _select_index([rows], df, axis="index")
if invert:
row_indexer = np.ones(df.index.size, dtype=np.bool_)
row_indexer[outcome] = False
else:
rows = _select_index([rows], df, axis="index")
if columns is None:
columns = slice(None)
row_indexer = outcome
if columns is None:
column_indexer = slice(None)
else:
outcome = _select_index([columns], df, axis="columns")
if invert:
column_indexer = np.ones(df.columns.size, dtype=np.bool_)
column_indexer[outcome] = False
else:
columns = _select_index([columns], df, axis="columns")
return df.iloc[rows, columns]
indices = _select_index(list(args), df, axis)
if invert:
rev = np.ones(getattr(df, axis).size, dtype=np.bool_)
rev[indices] = False
return df.iloc(axis=axis)[rev]
return df.iloc(axis=axis)[indices]
column_indexer = outcome
return df.iloc[row_indexer, column_indexer]


class _JoinOperator(Enum):
Expand Down
Loading
Loading