Merge branch 'dev' into samukweku/complete_fillna

pyjanitor-devs · Oct 2, 2023 · 7290097 · 7290097
2 parents 1ce0221 + 3f4a034
commit 7290097
Show file tree

Hide file tree

Showing 11 changed files with 243 additions and 132 deletions.
diff --git a/AUTHORS.md b/AUTHORS.md
@@ -112,3 +112,4 @@ Contributors
 - [@asmirnov69](https://github.com/asmirnov69) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%asmirnov69)
 - [@xujiboy](https://github.com/xujiboy) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%xujiboy)
 - [@joranbeasley](https://github.com/joranbeasley) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%joranbeasley)
+- [@kianmeng](https://github.com/kianmeng) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1290#issue-1906020324)
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,9 @@
 # Changelog
 
 ## [Unreleased]
+
 -   [BUG] Fix logic for groupby in complete. Index support deprecated. Fix deprecation warning for fillna in `complete` PR #1289 @samukweku
+-   [ENH] `select` function now supports variable arguments - PR #1288 @samukweku
 
 ## [v0.26.0] - 2023-09-18
 

diff --git a/janitor/functions/currency_column_to_numeric.py b/janitor/functions/currency_column_to_numeric.py
@@ -92,10 +92,16 @@ def currency_column_to_numeric(
 
     column_series = df[column_name]
     if cleaning_style == "accounting":
-        df.loc[:, column_name] = df[column_name].apply(
-            _clean_accounting_column
+        outcome = (
+            df[column_name]
+            .str.strip()
+            .str.replace(",", "", regex=False)
+            .str.replace(")", "", regex=False)
+            .str.replace("(", "-", regex=False)
+            .replace({"-": 0.0})
+            .astype(float)
         )
-        return df
+        return df.assign(**{column_name: outcome})
     if cleaning_style is not None:
         raise ValueError(
             "`cleaning_style` is expected to be one of ('accounting', None). "
@@ -130,29 +136,6 @@ def currency_column_to_numeric(
     return df
 
 
-def _clean_accounting_column(x: str) -> float:
-    """Perform the logic for the "accounting" cleaning style.
-
-    This is a private function, not intended to be used outside of
-    `currency_column_to_numeric``.
-
-    It is intended to be used in a pandas `apply` method.
-
-    Args:
-        x: A string representing currency.
-
-    Returns:
-        A float representing currency.
-    """
-    y = x.strip()
-    y = y.replace(",", "")
-    y = y.replace(")", "")
-    y = y.replace("(", "-")
-    if y == "-":
-        return 0.00
-    return float(y)
-
-
 def _currency_column_to_numeric(
     x: str,
     cast_non_numeric: Optional[dict] = None,

diff --git a/janitor/functions/filter.py b/janitor/functions/filter.py
@@ -260,31 +260,28 @@ def _date_filter_conditions(conditions):
         """Taken from: https://stackoverflow.com/a/13616382."""
         return reduce(np.logical_and, conditions)
 
-    if column_date_options:
-        df.loc[:, column_name] = pd.to_datetime(
-            df.loc[:, column_name], **column_date_options
-        )
-    else:
-        df.loc[:, column_name] = pd.to_datetime(df.loc[:, column_name])
+    if column_date_options is None:
+        column_date_options = {}
+    df[column_name] = pd.to_datetime(df[column_name], **column_date_options)
 
     _filter_list = []
 
     if start_date:
         start_date = pd.to_datetime(start_date, format=format)
-        _filter_list.append(df.loc[:, column_name] >= start_date)
+        _filter_list.append(df[column_name] >= start_date)
 
     if end_date:
         end_date = pd.to_datetime(end_date, format=format)
-        _filter_list.append(df.loc[:, column_name] <= end_date)
+        _filter_list.append(df[column_name] <= end_date)
 
     if years:
-        _filter_list.append(df.loc[:, column_name].dt.year.isin(years))
+        _filter_list.append(df[column_name].dt.year.isin(years))
 
     if months:
-        _filter_list.append(df.loc[:, column_name].dt.month.isin(months))
+        _filter_list.append(df[column_name].dt.month.isin(months))
 
     if days:
-        _filter_list.append(df.loc[:, column_name].dt.day.isin(days))
+        _filter_list.append(df[column_name].dt.day.isin(days))
 
     if start_date and end_date and start_date > end_date:
         warnings.warn(

diff --git a/janitor/functions/select.py b/janitor/functions/select.py
@@ -1,12 +1,18 @@
 from typing import Any
 import pandas_flavor as pf
 import pandas as pd
-from janitor.utils import deprecated_alias
+from janitor.utils import refactored_function
+from janitor.utils import check, deprecated_alias
 from janitor.functions.utils import _select, DropLabel  # noqa: F401
 
 
 @pf.register_dataframe_method
-@deprecated_alias(search_cols="search_column_names")
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `jn.select` instead."
+    )
+)
 def select_columns(
     df: pd.DataFrame,
     *args: Any,
@@ -30,6 +36,11 @@ def select_columns(
         is with `.loc` or `.iloc` methods.
         `select_columns` is primarily for convenience.
 
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `jn.select` instead.
+
     Examples:
         >>> import pandas as pd
         >>> import janitor
@@ -209,19 +220,26 @@ class      mammal
             a callable,
             or variable arguments of all the aforementioned.
             A sequence of booleans is also acceptable.
-            A dictionary can be used for selection on a MultiIndex on different levels.
+            A dictionary can be used for selection
+            on a MultiIndex on different levels.
         invert: Whether or not to invert the selection.
-            This will result in the selection of the complement of the columns
-            provided.
+            This will result in the selection
+            of the complement of the columns provided.
 
     Returns:
         A pandas DataFrame with the specified columns selected.
     """  # noqa: E501
 
-    return _select(df, args=args, invert=invert, axis="columns")
+    return _select(df, columns=list(args), invert=invert)
 
 
 @pf.register_dataframe_method
+@refactored_function(
+    message=(
+        "This function will be deprecated in a 1.x release. "
+        "Please use `jn.select` instead."
+    )
+)
 def select_rows(
     df: pd.DataFrame,
     *args: Any,
@@ -242,13 +260,17 @@ def select_rows(
 
     !!! info "New in version 0.24.0"
 
-
     !!!note
 
         The preferred option when selecting columns or rows in a Pandas DataFrame
         is with `.loc` or `.iloc` methods, as they are generally performant.
         `select_rows` is primarily for convenience.
 
+    !!!note
+
+        This function will be deprecated in a 1.x release.
+        Please use `jn.select` instead.
+
     Examples:
         >>> import pandas as pd
         >>> import janitor
@@ -275,20 +297,27 @@ def select_rows(
             a callable,
             or variable arguments of all the aforementioned.
             A sequence of booleans is also acceptable.
-            A dictionary can be used for selection on a MultiIndex on different levels.
+            A dictionary can be used for selection
+            on a MultiIndex on different levels.
         invert: Whether or not to invert the selection.
-            This will result in the selection of the complement of the rows
-            provided.
+            This will result in the selection
+            of the complement of the rows provided.
 
     Returns:
         A pandas DataFrame with the specified rows selected.
     """  # noqa: E501
-    return _select(df, args=args, invert=invert, axis="index")
+    return _select(df, rows=list(args), invert=invert)
 
 
 @pf.register_dataframe_method
+@deprecated_alias(rows="index")
 def select(
-    df: pd.DataFrame, *, rows: Any = None, columns: Any = None
+    df: pd.DataFrame,
+    *args,
+    index: Any = None,
+    columns: Any = None,
+    axis: str = "columns",
+    invert: bool = False,
 ) -> pd.DataFrame:
     """Method-chainable selection of rows and columns.
 
@@ -302,6 +331,8 @@ def select(
 
     Selection can be inverted with the `DropLabel` class.
 
+    The selection of the index/columns can also be inverted via `invert`.
+
 
     !!! info "New in version 0.24.0"
 
@@ -312,6 +343,12 @@ def select(
         is with `.loc` or `.iloc` methods, as they are generally performant.
         `select` is primarily for convenience.
 
+    !!! abstract "Version Changed"
+
+        - 0.26.0
+            - Added variable `args`, `invert` and `axis` parameters.
+            - `rows` keyword deprecated in favour of `index`.
+
     Examples:
         >>> import pandas as pd
         >>> import janitor
@@ -323,13 +360,13 @@ def select(
         cobra               1       2
         viper               4       5
         sidewinder          7       8
-        >>> df.select(rows='cobra', columns='shield')
+        >>> df.select(index='cobra', columns='shield')
                shield
         cobra       2
 
         Labels can be dropped with the `DropLabel` class:
 
-        >>> df.select(rows=DropLabel('cobra'))
+        >>> df.select(index=DropLabel('cobra'))
                     max_speed  shield
         viper               4       5
         sidewinder          7       8
@@ -339,23 +376,54 @@ def select(
 
     Args:
         df: A pandas DataFrame.
-        rows: Valid inputs include: an exact label to look for,
+        *args: Valid inputs include: an exact index name to look for,
+            a shell-style glob string (e.g. `*_thing_*`),
+            a regular expression,
+            a callable,
+            or variable arguments of all the aforementioned.
+            A sequence of booleans is also acceptable.
+            A dictionary can be used for selection
+            on a MultiIndex on different levels.
+        index: Valid inputs include: an exact label to look for,
             a shell-style glob string (e.g. `*_thing_*`),
             a regular expression,
             a callable,
             or variable arguments of all the aforementioned.
             A sequence of booleans is also acceptable.
-            A dictionary can be used for selection on a MultiIndex on different levels.
+            A dictionary can be used for selection
+            on a MultiIndex on different levels.
         columns: Valid inputs include: an exact label to look for,
             a shell-style glob string (e.g. `*_thing_*`),
             a regular expression,
             a callable,
             or variable arguments of all the aforementioned.
             A sequence of booleans is also acceptable.
-            A dictionary can be used for selection on a MultiIndex on different levels.
+            A dictionary can be used for selection
+            on a MultiIndex on different levels.
+        invert: Whether or not to invert the selection.
+            This will result in the selection
+            of the complement of the rows/columns provided.
+        axis: Whether the selection should be on the index (`'index'`)
+            or on the columns (`'columns'`).
+            Applicable only when variable `args` are provided.
+
+    Raises:
+        ValueError: If args and index/columns are provided, or axis is invalid.
 
     Returns:
         A pandas DataFrame with the specified rows and/or columns selected.
     """  # noqa: E501
 
-    return _select(df, args=None, rows=rows, columns=columns, axis="both")
+    if args:
+        check("invert", invert, [bool])
+        if (index is not None) or (columns is not None):
+            raise ValueError(
+                "Either provide variable args with the axis parameter, "
+                "or provide arguments to the index and/or columns parameters."
+            )
+        if axis == "index":
+            return _select(df, rows=list(args), columns=columns, invert=invert)
+        if axis == "columns":
+            return _select(df, columns=list(args), rows=index, invert=invert)
+        raise ValueError("axis should be either 'index' or 'columns'.")
+    return _select(df, rows=index, columns=columns, invert=invert)
diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py
@@ -642,9 +642,7 @@ def get_columns(group: Union[DataFrameGroupBy, SeriesGroupBy], label):
 
 def _select(
     df: pd.DataFrame,
-    args: tuple,
     invert: bool = False,
-    axis: str = "index",
     rows=None,
     columns=None,
 ) -> pd.DataFrame:
@@ -653,23 +651,25 @@ def _select(
 
     Returns a DataFrame.
     """
-    assert axis in {"both", "index", "columns"}
-    if axis == "both":
-        if rows is None:
-            rows = slice(None)
+    if rows is None:
+        row_indexer = slice(None)
+    else:
+        outcome = _select_index([rows], df, axis="index")
+        if invert:
+            row_indexer = np.ones(df.index.size, dtype=np.bool_)
+            row_indexer[outcome] = False
         else:
-            rows = _select_index([rows], df, axis="index")
-        if columns is None:
-            columns = slice(None)
+            row_indexer = outcome
+    if columns is None:
+        column_indexer = slice(None)
+    else:
+        outcome = _select_index([columns], df, axis="columns")
+        if invert:
+            column_indexer = np.ones(df.columns.size, dtype=np.bool_)
+            column_indexer[outcome] = False
         else:
-            columns = _select_index([columns], df, axis="columns")
-        return df.iloc[rows, columns]
-    indices = _select_index(list(args), df, axis)
-    if invert:
-        rev = np.ones(getattr(df, axis).size, dtype=np.bool_)
-        rev[indices] = False
-        return df.iloc(axis=axis)[rev]
-    return df.iloc(axis=axis)[indices]
+            column_indexer = outcome
+    return df.iloc[row_indexer, column_indexer]
 
 
 class _JoinOperator(Enum):