Skip to content

Commit

Permalink
Merge branch 'dev' into samukweku/complete_fillna
Browse files Browse the repository at this point in the history
  • Loading branch information
ericmjl authored Oct 2, 2023
2 parents 1ce0221 + 3f4a034 commit 7290097
Show file tree
Hide file tree
Showing 11 changed files with 243 additions and 132 deletions.
1 change: 1 addition & 0 deletions AUTHORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,4 @@ Contributors
- [@asmirnov69](https://github.com/asmirnov69) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3Aasmirnov69)
- [@xujiboy](https://github.com/xujiboy) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3Axujiboy)
- [@joranbeasley](https://github.com/joranbeasley) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3Ajoranbeasley)
- [@kianmeng](https://github.com/kianmeng) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1290#issue-1906020324)
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# Changelog

## [Unreleased]

- [BUG] Fix logic for groupby in `complete`. Index support deprecated. Fix deprecation warning for fillna in `complete` - PR #1289 @samukweku
- [ENH] `select` function now supports variable arguments - PR #1288 @samukweku

## [v0.26.0] - 2023-09-18

Expand Down
35 changes: 9 additions & 26 deletions janitor/functions/currency_column_to_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,16 @@ def currency_column_to_numeric(

column_series = df[column_name]
if cleaning_style == "accounting":
df.loc[:, column_name] = df[column_name].apply(
_clean_accounting_column
outcome = (
df[column_name]
.str.strip()
.str.replace(",", "", regex=False)
.str.replace(")", "", regex=False)
.str.replace("(", "-", regex=False)
.replace({"-": 0.0})
.astype(float)
)
return df
return df.assign(**{column_name: outcome})
if cleaning_style is not None:
raise ValueError(
"`cleaning_style` is expected to be one of ('accounting', None). "
Expand Down Expand Up @@ -130,29 +136,6 @@ def currency_column_to_numeric(
return df


def _clean_accounting_column(x: str) -> float:
    """Perform the logic for the "accounting" cleaning style.

    This is a private function, not intended to be used outside of
    `currency_column_to_numeric`.
    It is intended to be used in a pandas `apply` method.

    Surrounding whitespace is stripped, thousands separators (`,`) are
    removed, and a parenthesised amount is read as negative: `(` becomes
    a leading `-` and `)` is dropped. A lone `-` (the accounting
    placeholder for "nothing") is returned as zero.

    Args:
        x: A string representing currency.

    Returns:
        A float representing currency.

    Raises:
        ValueError: If the cleaned string is not a valid float literal.
    """
    # One pass over the string: "(" -> "-", and "," / ")" are deleted.
    # Equivalent to the three chained `.replace()` calls it supersedes.
    table = str.maketrans("(", "-", ",)")
    cleaned = x.strip().translate(table)
    if cleaned == "-":
        return 0.00
    return float(cleaned)


def _currency_column_to_numeric(
x: str,
cast_non_numeric: Optional[dict] = None,
Expand Down
19 changes: 8 additions & 11 deletions janitor/functions/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,31 +260,28 @@ def _date_filter_conditions(conditions):
"""Taken from: https://stackoverflow.com/a/13616382."""
return reduce(np.logical_and, conditions)

if column_date_options:
df.loc[:, column_name] = pd.to_datetime(
df.loc[:, column_name], **column_date_options
)
else:
df.loc[:, column_name] = pd.to_datetime(df.loc[:, column_name])
if column_date_options is None:
column_date_options = {}
df[column_name] = pd.to_datetime(df[column_name], **column_date_options)

_filter_list = []

if start_date:
start_date = pd.to_datetime(start_date, format=format)
_filter_list.append(df.loc[:, column_name] >= start_date)
_filter_list.append(df[column_name] >= start_date)

if end_date:
end_date = pd.to_datetime(end_date, format=format)
_filter_list.append(df.loc[:, column_name] <= end_date)
_filter_list.append(df[column_name] <= end_date)

if years:
_filter_list.append(df.loc[:, column_name].dt.year.isin(years))
_filter_list.append(df[column_name].dt.year.isin(years))

if months:
_filter_list.append(df.loc[:, column_name].dt.month.isin(months))
_filter_list.append(df[column_name].dt.month.isin(months))

if days:
_filter_list.append(df.loc[:, column_name].dt.day.isin(days))
_filter_list.append(df[column_name].dt.day.isin(days))

if start_date and end_date and start_date > end_date:
warnings.warn(
Expand Down
104 changes: 86 additions & 18 deletions janitor/functions/select.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
from typing import Any
import pandas_flavor as pf
import pandas as pd
from janitor.utils import deprecated_alias
from janitor.utils import refactored_function
from janitor.utils import check, deprecated_alias
from janitor.functions.utils import _select, DropLabel # noqa: F401


@pf.register_dataframe_method
@deprecated_alias(search_cols="search_column_names")
@refactored_function(
message=(
"This function will be deprecated in a 1.x release. "
"Please use `jn.select` instead."
)
)
def select_columns(
df: pd.DataFrame,
*args: Any,
Expand All @@ -30,6 +36,11 @@ def select_columns(
is with `.loc` or `.iloc` methods.
`select_columns` is primarily for convenience.
!!!note
This function will be deprecated in a 1.x release.
Please use `jn.select` instead.
Examples:
>>> import pandas as pd
>>> import janitor
Expand Down Expand Up @@ -209,19 +220,26 @@ class mammal
a callable,
or variable arguments of all the aforementioned.
A sequence of booleans is also acceptable.
A dictionary can be used for selection on a MultiIndex on different levels.
A dictionary can be used for selection
on a MultiIndex on different levels.
invert: Whether or not to invert the selection.
This will result in the selection of the complement of the columns
provided.
This will result in the selection
of the complement of the columns provided.
Returns:
A pandas DataFrame with the specified columns selected.
""" # noqa: E501

return _select(df, args=args, invert=invert, axis="columns")
return _select(df, columns=list(args), invert=invert)


@pf.register_dataframe_method
@refactored_function(
message=(
"This function will be deprecated in a 1.x release. "
"Please use `jn.select` instead."
)
)
def select_rows(
df: pd.DataFrame,
*args: Any,
Expand All @@ -242,13 +260,17 @@ def select_rows(
!!! info "New in version 0.24.0"
!!!note
The preferred option when selecting columns or rows in a Pandas DataFrame
is with `.loc` or `.iloc` methods, as they are generally performant.
`select_rows` is primarily for convenience.
!!!note
This function will be deprecated in a 1.x release.
Please use `jn.select` instead.
Examples:
>>> import pandas as pd
>>> import janitor
Expand All @@ -275,20 +297,27 @@ def select_rows(
a callable,
or variable arguments of all the aforementioned.
A sequence of booleans is also acceptable.
A dictionary can be used for selection on a MultiIndex on different levels.
A dictionary can be used for selection
on a MultiIndex on different levels.
invert: Whether or not to invert the selection.
This will result in the selection of the complement of the rows
provided.
This will result in the selection
of the complement of the rows provided.
Returns:
A pandas DataFrame with the specified rows selected.
""" # noqa: E501
return _select(df, args=args, invert=invert, axis="index")
return _select(df, rows=list(args), invert=invert)


@pf.register_dataframe_method
@deprecated_alias(rows="index")
def select(
df: pd.DataFrame, *, rows: Any = None, columns: Any = None
df: pd.DataFrame,
*args,
index: Any = None,
columns: Any = None,
axis: str = "columns",
invert: bool = False,
) -> pd.DataFrame:
"""Method-chainable selection of rows and columns.
Expand All @@ -302,6 +331,8 @@ def select(
Selection can be inverted with the `DropLabel` class.
Optional ability to invert selection of index/columns available as well.
!!! info "New in version 0.24.0"
Expand All @@ -312,6 +343,12 @@ def select(
is with `.loc` or `.iloc` methods, as they are generally performant.
`select` is primarily for convenience.
!!! abstract "Version Changed"
- 0.26.0
- Added variable `args`, `invert` and `axis` parameters.
- `rows` keyword deprecated in favour of `index`.
Examples:
>>> import pandas as pd
>>> import janitor
Expand All @@ -323,13 +360,13 @@ def select(
cobra 1 2
viper 4 5
sidewinder 7 8
>>> df.select(rows='cobra', columns='shield')
>>> df.select(index='cobra', columns='shield')
shield
cobra 2
Labels can be dropped with the `DropLabel` class:
>>> df.select(rows=DropLabel('cobra'))
>>> df.select(index=DropLabel('cobra'))
max_speed shield
viper 4 5
sidewinder 7 8
Expand All @@ -339,23 +376,54 @@ def select(
Args:
df: A pandas DataFrame.
rows: Valid inputs include: an exact label to look for,
*args: Valid inputs include: an exact index name to look for,
a shell-style glob string (e.g. `*_thing_*`),
a regular expression,
a callable,
or variable arguments of all the aforementioned.
A sequence of booleans is also acceptable.
A dictionary can be used for selection
on a MultiIndex on different levels.
index: Valid inputs include: an exact label to look for,
a shell-style glob string (e.g. `*_thing_*`),
a regular expression,
a callable,
or variable arguments of all the aforementioned.
A sequence of booleans is also acceptable.
A dictionary can be used for selection on a MultiIndex on different levels.
A dictionary can be used for selection
on a MultiIndex on different levels.
columns: Valid inputs include: an exact label to look for,
a shell-style glob string (e.g. `*_thing_*`),
a regular expression,
a callable,
or variable arguments of all the aforementioned.
A sequence of booleans is also acceptable.
A dictionary can be used for selection on a MultiIndex on different levels.
A dictionary can be used for selection
on a MultiIndex on different levels.
invert: Whether or not to invert the selection.
This will result in the selection
of the complement of the rows/columns provided.
axis: Whether the selection should be on the index('index'),
or columns('columns').
Applicable only for the variable args parameter.
Raises:
ValueError: If args and index/columns are provided.
Returns:
A pandas DataFrame with the specified rows and/or columns selected.
""" # noqa: E501

return _select(df, args=None, rows=rows, columns=columns, axis="both")
if args:
check("invert", invert, [bool])
if (index is not None) or (columns is not None):
raise ValueError(
"Either provide variable args with the axis parameter, "
"or provide arguments to the index and/or columns parameters."
)
if axis == "index":
return _select(df, rows=list(args), columns=columns, invert=invert)
if axis == "columns":
return _select(df, columns=list(args), rows=index, invert=invert)
raise ValueError("axis should be either 'index' or 'columns'.")
return _select(df, rows=index, columns=columns, invert=invert)
34 changes: 17 additions & 17 deletions janitor/functions/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -642,9 +642,7 @@ def get_columns(group: Union[DataFrameGroupBy, SeriesGroupBy], label):

def _select(
df: pd.DataFrame,
args: tuple,
invert: bool = False,
axis: str = "index",
rows=None,
columns=None,
) -> pd.DataFrame:
Expand All @@ -653,23 +651,25 @@ def _select(
Returns a DataFrame.
"""
assert axis in {"both", "index", "columns"}
if axis == "both":
if rows is None:
rows = slice(None)
if rows is None:
row_indexer = slice(None)
else:
outcome = _select_index([rows], df, axis="index")
if invert:
row_indexer = np.ones(df.index.size, dtype=np.bool_)
row_indexer[outcome] = False
else:
rows = _select_index([rows], df, axis="index")
if columns is None:
columns = slice(None)
row_indexer = outcome
if columns is None:
column_indexer = slice(None)
else:
outcome = _select_index([columns], df, axis="columns")
if invert:
column_indexer = np.ones(df.columns.size, dtype=np.bool_)
column_indexer[outcome] = False
else:
columns = _select_index([columns], df, axis="columns")
return df.iloc[rows, columns]
indices = _select_index(list(args), df, axis)
if invert:
rev = np.ones(getattr(df, axis).size, dtype=np.bool_)
rev[indices] = False
return df.iloc(axis=axis)[rev]
return df.iloc(axis=axis)[indices]
column_indexer = outcome
return df.iloc[row_indexer, column_indexer]


class _JoinOperator(Enum):
Expand Down
Loading

0 comments on commit 7290097

Please sign in to comment.