Data match filter (#139)

* #138 * Update data_match.Rd * fix check issues, docs * typo * wording * better description * docs * data_filter is short-cut, improve see-also * Update test-labelled_data.R * data match matches, data filter filters * Update test-labelled_data.R
easystats · Mar 23, 2022 · 23a1301 · 23a1301
1 parent c402931
commit 23a1301
Show file tree

Hide file tree

Showing 15 changed files with 191 additions and 35 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -117,6 +117,7 @@ export(data_addsuffix)
 export(data_adjust)
 export(data_cut)
 export(data_extract)
+export(data_filter)
 export(data_findcols)
 export(data_join)
 export(data_match)

diff --git a/NEWS.md b/NEWS.md
@@ -25,6 +25,9 @@ NEW FUNCTIONS
     select-helpers). These functions will supersede `data_findcols()` in the
     future.
 
+  * `data_filter()` as a complement to `data_match()`; it filters rows of
+    data frames using logical expressions.
+
   * For computing weighted centrality measures and dispersion: `weighted_mean()`,
     `weighted_median()`, `weighted_sd()` and `weighted_mad()`.
 

diff --git a/R/data_match.R b/R/data_match.R
@@ -1,34 +1,77 @@
-#' Find row indices of a data frame matching a specific condition
+#' Return filtered data frame or row indices
 #'
-#' Find row indices of a data frame that match a specific condition.
+#' Return a filtered data frame or row indices of a data frame that match a
+#' specific condition. `data_filter()` works like `data_match()`, but uses a
+#' logical expression instead of a data frame to specify matching conditions.
 #'
 #' @param x A data frame.
-#' @param to A data frame matching the specified conditions.
+#' @param to A data frame matching the specified conditions. Note that if
+#'   `match` is a value other than `"and"`, the original row order might be
+#'   changed. See 'Details'.
+#' @param filter A logical expression indicating which rows to keep.
 #' @param match String, indicating with which logical operation matching
 #'   conditions should be combined. Can be `"and"` (or `"&"`), `"or"` (or `"|"`)
 #'   or `"not"` (or `"!"`).
-#' @param return_indices Logical, if `FALSE`, return the vector of rows that can be used to filter the original data frame. If `FALSE` (default), returns directly the filtered data frame
-#'   instead of the row indices.
+#' @param return_indices Logical, if `TRUE`, return the vector of rows that
+#'   can be used to filter the original data frame. If `FALSE` (default),
+#'   returns directly the filtered data frame instead of the row indices.
 #' @param ... Not used.
 #'
-#' @return The row indices that match the specified configuration.
+#' @return A filtered data frame, or the row indices that match the specified configuration.
+#'
+#' @details For `data_match()`, if `match` is either `"or"` or `"not"`, the
+#' original row order from `x` might be changed. If preserving row order is
+#' required, use `data_filter()` instead.
+#'
+#' ```
+#' # mimics subset() behaviour, preserving original row order
+#' head(data_filter(mtcars[c("mpg", "vs", "am")], vs == 0 | am == 1))
+#' #>                    mpg vs am
+#' #> Mazda RX4         21.0  0  1
+#' #> Mazda RX4 Wag     21.0  0  1
+#' #> Datsun 710        22.8  1  1
+#' #> Hornet Sportabout 18.7  0  0
+#' #> Duster 360        14.3  0  0
+#' #> Merc 450SE        16.4  0  0
+#'
+#' # re-sorting rows
+#' head(data_match(mtcars[c("mpg", "vs", "am")],
+#'                 data.frame(vs = 0, am = 1),
+#'                 match = "or"))
+#' #>                    mpg vs am
+#' #> Mazda RX4         21.0  0  1
+#' #> Mazda RX4 Wag     21.0  0  1
+#' #> Hornet Sportabout 18.7  0  0
+#' #> Duster 360        14.3  0  0
+#' #> Merc 450SE        16.4  0  0
+#' #> Merc 450SL        17.3  0  0
+#' ```
+#'
+#' While `data_match()` works with data frames to match conditions against,
+#' `data_filter()` is basically a wrapper around `subset(subset = <filter>)`.
+#' However, unlike `subset()`, it preserves label attributes and is useful when
+#' working with labelled data.
 #'
 #' @examples
 #' data_match(mtcars, data.frame(vs = 0, am = 1))
 #' data_match(mtcars, data.frame(vs = 0, am = c(0, 1)))
 #'
 #' # observations where "vs" is NOT 0 AND "am" is NOT 1
 #' data_match(mtcars, data.frame(vs = 0, am = 1), match = "not")
+#' # equivalent to
+#' data_filter(mtcars, vs != 0 & am != 1)
 #'
 #' # observations where EITHER "vs" is 0 OR "am" is 1
 #' data_match(mtcars, data.frame(vs = 0, am = 1), match = "or")
+#' # equivalent to
+#' data_filter(mtcars, vs == 0 | am == 1)
 #'
 #' @inherit data_rename seealso
 #' @export
 data_match <- function(x, to, match = "and", return_indices = FALSE, ...) {
-
-  # Input checks
-  if (!is.data.frame(to)) to <- as.data.frame(to)
+  if (!is.data.frame(to)) {
+    to <- as.data.frame(to)
+  }
   original_x <- x
 
   # evaluate
@@ -86,3 +129,18 @@ data_match <- function(x, to, match = "and", return_indices = FALSE, ...) {
 
   out
 }
+
+
+
+#' @rdname data_match
+#' @export
+data_filter <- function(x, filter, ...) {
+  # Capture the filter expression unevaluated. It is evaluated below with the
+  # columns of `x` taking precedence and the *caller's* environment as
+  # enclosure. This way, objects defined where data_filter() is called are
+  # found, and the local variables of this function (`x`, `condition`, `out`,
+  # ...) can never shadow names that are used inside `filter`.
+  condition <- substitute(filter)
+  rows <- eval(condition, x, parent.frame())
+  if (!is.logical(rows)) {
+    stop("`filter` must be a logical expression.", call. = FALSE)
+  }
+  # same semantics as subset(): rows where the condition is NA are dropped
+  out <- x[rows & !is.na(rows), , drop = FALSE]
+  # restore value and variable labels, which row subsetting may drop
+  for (i in colnames(out)) {
+    attr(out[[i]], "label") <- attr(x[[i]], "label", exact = TRUE)
+    attr(out[[i]], "labels") <- attr(x[[i]], "labels", exact = TRUE)
+  }
+  out
+}
diff --git a/R/data_rename.R b/R/data_rename.R
@@ -38,8 +38,9 @@
 #' - Functions to reshape, pivot or rotate dataframes: [data_to_long()], [data_to_wide()], [data_rotate()]
 #' - Functions to rescale and reverse: [data_rescale()], [data_reverse()]
 #' - Functions to standardize, normalize, rank-transform: [standardize()], [normalize()], [ranktransform()], [winsorize()]
-#' - Split, cut and merge dataframes: [data_partition()], [data_cut()], [data_match()], [data_merge()]
-#' - Functions to find columns: [find_columns()]
+#' - Split, cut and merge dataframes: [data_partition()], [data_cut()], [data_merge()]
+#' - Functions to find or select columns: [find_columns()]
+#' - Functions to filter rows: [data_match()], [data_filter()]
 #'
 #' @export
 data_rename <- function(data, pattern = NULL, replacement = NULL, safe = TRUE, ...) {

diff --git a/man/data_match.Rd b/man/data_match.Rd
diff --git a/man/data_merge.Rd b/man/data_merge.Rd
diff --git a/man/data_partition.Rd b/man/data_partition.Rd
diff --git a/man/data_relocate.Rd b/man/data_relocate.Rd
diff --git a/man/data_rename.Rd b/man/data_rename.Rd
diff --git a/man/data_rotate.Rd b/man/data_rotate.Rd
diff --git a/man/data_to_long.Rd b/man/data_to_long.Rd
diff --git a/man/find_columns.Rd b/man/find_columns.Rd