From 706d1e9557888f5d2a930001fa21eff1e408f781 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 5 Jun 2024 23:45:32 +0100 Subject: [PATCH] Implement `$has_nulls()` (#1133) --- NEWS.md | 4 + R/expr__expr.R | 16 ++ R/expr__name.R | 2 +- man/Expr_has_nulls.Rd | 23 ++ tests/testthat/_snaps/after-wrappers.md | 324 ++++++++++++------------ tests/testthat/test-expr_expr.R | 13 + 6 files changed, 220 insertions(+), 162 deletions(-) create mode 100644 man/Expr_has_nulls.Rd diff --git a/NEWS.md b/NEWS.md index 9e57a2519..e886f5f24 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,10 @@ ## Polars R Package (development version) +### New features + +- New method `$has_nulls()` (#1133). + ## Polars R Package 0.17.0 ### Breaking changes diff --git a/R/expr__expr.R b/R/expr__expr.R index 129beaa90..e3a266e5c 100644 --- a/R/expr__expr.R +++ b/R/expr__expr.R @@ -3742,3 +3742,19 @@ Expr_qcut = function( unwrap("in $qcut():") } } + +#' Check whether the expression contains one or more null values +#' +#' @return Expr +#' +#' @examples +#' df = pl$DataFrame( +#' a = c(NA, 1, NA), +#' b = c(1, NA, 2), +#' c = c(1, 2, 3) +#' ) +#' +#' df$select(pl$all()$has_nulls()) +Expr_has_nulls = function() { + self$null_count() > 0 +} diff --git a/R/expr__name.R b/R/expr__name.R index a540492e4..5e7c53a26 100644 --- a/R/expr__name.R +++ b/R/expr__name.R @@ -50,7 +50,6 @@ ExprName_keep = function() { } # TODO: this method is broken after , so not documented -#' @noRd #' Map alias of expression with an R function #' #' Rename the output of an expression by mapping a function over the root name. @@ -69,6 +68,7 @@ ExprName_keep = function() { #' df$select( #' pl$col("var1")$alias("foobar")$name$map(\(x) paste0("new_", x)) #' ) +#' @noRd ExprName_map = function(fun) { if ( !polars_options()$no_messages && diff --git a/man/Expr_has_nulls.Rd b/man/Expr_has_nulls.Rd new file mode 100644 index 000000000..a10dcb291 --- /dev/null +++ b/man/Expr_has_nulls.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/expr__expr.R +\name{Expr_has_nulls} +\alias{Expr_has_nulls} +\title{Check whether the expression contains one or more null values} +\usage{ +Expr_has_nulls() +} +\value{ +Expr +} +\description{ +Check whether the expression contains one or more null values +} +\examples{ +df = pl$DataFrame( + a = c(NA, 1, NA), + b = c(1, NA, 2), + c = c(1, 2, 3) +) + +df$select(pl$all()$has_nulls()) +} diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index e6ca0d1a8..21e66fee0 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -236,46 +236,46 @@ [58] "filter" "first" "flatten" [61] "floor" "floor_div" "forward_fill" [64] "gather" "gather_every" "gt" - [67] "gt_eq" "hash" "head" - [70] "implode" "inspect" "interpolate" - [73] "is_between" "is_duplicated" "is_finite" - [76] "is_first_distinct" "is_in" "is_infinite" - [79] "is_last_distinct" "is_nan" "is_not_nan" - [82] "is_not_null" "is_null" "is_unique" - [85] "kurtosis" "last" "len" - [88] "limit" "list" "log" - [91] "log10" "lower_bound" "lt" - [94] "lt_eq" "map_batches" "map_elements" - [97] "max" "mean" "median" - [100] "meta" "min" "mod" - [103] "mode" "mul" "n_unique" - [106] "name" "nan_max" "nan_min" - [109] "neq" "neq_missing" "not" - [112] "null_count" "or" "over" - [115] "pct_change" "peak_max" "peak_min" - [118] "pow" "print" "product" - [121] "qcut" "quantile" "rank" - [124] "rechunk" "reinterpret" "rep" - [127] "repeat_by" "replace" "reshape" - [130] "reverse" "rle" "rle_id" - [133] "rolling" "rolling_max" "rolling_max_by" - [136] "rolling_mean" "rolling_mean_by" "rolling_median" - [139] "rolling_median_by" "rolling_min" "rolling_min_by" - [142] "rolling_quantile" "rolling_quantile_by" "rolling_skew" - [145] "rolling_std" "rolling_std_by" "rolling_sum" - [148] "rolling_sum_by" "rolling_var" "rolling_var_by" - [151] "round" "sample" "search_sorted" - [154] "set_sorted" "shift" "shift_and_fill" - [157] "shrink_dtype" "shuffle" "sign" - [160] "sin" "sinh" "skew" - [163] "slice" "sort" "sort_by" - [166] "sqrt" "std" "str" - [169] "struct" "sub" "sum" - [172] "tail" "tan" "tanh" - [175] "to_physical" "to_r" "to_series" - [178] "top_k" "unique" "unique_counts" - [181] "upper_bound" "value_counts" "var" - [184] "xor" + [67] "gt_eq" "has_nulls" "hash" + [70] "head" "implode" "inspect" + [73] "interpolate" "is_between" "is_duplicated" + [76] "is_finite" "is_first_distinct" "is_in" + [79] "is_infinite" "is_last_distinct" "is_nan" + [82] "is_not_nan" "is_not_null" "is_null" + [85] "is_unique" "kurtosis" "last" + [88] "len" "limit" "list" + [91] "log" "log10" "lower_bound" + [94] "lt" "lt_eq" "map_batches" + [97] "map_elements" "max" "mean" + [100] "median" "meta" "min" + [103] "mod" "mode" "mul" + [106] "n_unique" "name" "nan_max" + [109] "nan_min" "neq" "neq_missing" + [112] "not" "null_count" "or" + [115] "over" "pct_change" "peak_max" + [118] "peak_min" "pow" "print" + [121] "product" "qcut" "quantile" + [124] "rank" "rechunk" "reinterpret" + [127] "rep" "repeat_by" "replace" + [130] "reshape" "reverse" "rle" + [133] "rle_id" "rolling" "rolling_max" + [136] "rolling_max_by" "rolling_mean" "rolling_mean_by" + [139] "rolling_median" "rolling_median_by" "rolling_min" + [142] "rolling_min_by" "rolling_quantile" "rolling_quantile_by" + [145] "rolling_skew" "rolling_std" "rolling_std_by" + [148] "rolling_sum" "rolling_sum_by" "rolling_var" + [151] "rolling_var_by" "round" "sample" + [154] "search_sorted" "set_sorted" "shift" + [157] "shift_and_fill" "shrink_dtype" "shuffle" + [160] "sign" "sin" "sinh" + [163] "skew" "slice" "sort" + [166] "sort_by" "sqrt" "std" + [169] "str" "struct" "sub" + [172] "sum" "tail" "tan" + [175] "tanh" "to_physical" "to_r" + [178] "to_series" "top_k" "unique" + [181] "unique_counts" "upper_bound" "value_counts" + [184] "var" "xor" --- @@ -491,46 +491,47 @@ [58] "filter" "first" "flatten" [61] "floor" "floor_div" "forward_fill" [64] "gather" "gather_every" "gt" - [67] "gt_eq" "hash" "head" - [70] "implode" "inspect" "interpolate" - [73] "is_between" "is_duplicated" "is_finite" - [76] "is_first_distinct" "is_in" "is_infinite" - [79] "is_last_distinct" "is_nan" "is_not_nan" - [82] "is_not_null" "is_null" "is_unique" - [85] "kurtosis" "last" "len" - [88] "limit" "list" "log" - [91] "log10" "lower_bound" "lt" - [94] "lt_eq" "map_batches" "map_elements" - [97] "max" "mean" "median" - [100] "meta" "min" "mod" - [103] "mode" "mul" "n_unique" - [106] "name" "nan_max" "nan_min" - [109] "neq" "neq_missing" "not" - [112] "null_count" "or" "otherwise" - [115] "over" "pct_change" "peak_max" - [118] "peak_min" "pow" "print" - [121] "product" "qcut" "quantile" - [124] "rank" "rechunk" "reinterpret" - [127] "rep" "repeat_by" "replace" - [130] "reshape" "reverse" "rle" - [133] "rle_id" "rolling" "rolling_max" - [136] "rolling_max_by" "rolling_mean" "rolling_mean_by" - [139] "rolling_median" "rolling_median_by" "rolling_min" - [142] "rolling_min_by" "rolling_quantile" "rolling_quantile_by" - [145] "rolling_skew" "rolling_std" "rolling_std_by" - [148] "rolling_sum" "rolling_sum_by" "rolling_var" - [151] "rolling_var_by" "round" "sample" - [154] "search_sorted" "set_sorted" "shift" - [157] "shift_and_fill" "shrink_dtype" "shuffle" - [160] "sign" "sin" "sinh" - [163] "skew" "slice" "sort" - [166] "sort_by" "sqrt" "std" - [169] "str" "struct" "sub" - [172] "sum" "tail" "tan" - [175] "tanh" "to_physical" "to_r" - [178] "to_series" "top_k" "unique" - [181] "unique_counts" "upper_bound" "value_counts" - [184] "var" "when" "xor" + [67] "gt_eq" "has_nulls" "hash" + [70] "head" "implode" "inspect" + [73] "interpolate" "is_between" "is_duplicated" + [76] "is_finite" "is_first_distinct" "is_in" + [79] "is_infinite" "is_last_distinct" "is_nan" + [82] "is_not_nan" "is_not_null" "is_null" + [85] "is_unique" "kurtosis" "last" + [88] "len" "limit" "list" + [91] "log" "log10" "lower_bound" + [94] "lt" "lt_eq" "map_batches" + [97] "map_elements" "max" "mean" + [100] "median" "meta" "min" + [103] "mod" "mode" "mul" + [106] "n_unique" "name" "nan_max" + [109] "nan_min" "neq" "neq_missing" + [112] "not" "null_count" "or" + [115] "otherwise" "over" "pct_change" + [118] "peak_max" "peak_min" "pow" + [121] "print" "product" "qcut" + [124] "quantile" "rank" "rechunk" + [127] "reinterpret" "rep" "repeat_by" + [130] "replace" "reshape" "reverse" + [133] "rle" "rle_id" "rolling" + [136] "rolling_max" "rolling_max_by" "rolling_mean" + [139] "rolling_mean_by" "rolling_median" "rolling_median_by" + [142] "rolling_min" "rolling_min_by" "rolling_quantile" + [145] "rolling_quantile_by" "rolling_skew" "rolling_std" + [148] "rolling_std_by" "rolling_sum" "rolling_sum_by" + [151] "rolling_var" "rolling_var_by" "round" + [154] "sample" "search_sorted" "set_sorted" + [157] "shift" "shift_and_fill" "shrink_dtype" + [160] "shuffle" "sign" "sin" + [163] "sinh" "skew" "slice" + [166] "sort" "sort_by" "sqrt" + [169] "std" "str" "struct" + [172] "sub" "sum" "tail" + [175] "tan" "tanh" "to_physical" + [178] "to_r" "to_series" "top_k" + [181] "unique" "unique_counts" "upper_bound" + [184] "value_counts" "var" "when" + [187] "xor" --- @@ -580,46 +581,47 @@ [58] "filter" "first" "flatten" [61] "floor" "floor_div" "forward_fill" [64] "gather" "gather_every" "gt" - [67] "gt_eq" "hash" "head" - [70] "implode" "inspect" "interpolate" - [73] "is_between" "is_duplicated" "is_finite" - [76] "is_first_distinct" "is_in" "is_infinite" - [79] "is_last_distinct" "is_nan" "is_not_nan" - [82] "is_not_null" "is_null" "is_unique" - [85] "kurtosis" "last" "len" - [88] "limit" "list" "log" - [91] "log10" "lower_bound" "lt" - [94] "lt_eq" "map_batches" "map_elements" - [97] "max" "mean" "median" - [100] "meta" "min" "mod" - [103] "mode" "mul" "n_unique" - [106] "name" "nan_max" "nan_min" - [109] "neq" "neq_missing" "not" - [112] "null_count" "or" "otherwise" - [115] "over" "pct_change" "peak_max" - [118] "peak_min" "pow" "print" - [121] "product" "qcut" "quantile" - [124] "rank" "rechunk" "reinterpret" - [127] "rep" "repeat_by" "replace" - [130] "reshape" "reverse" "rle" - [133] "rle_id" "rolling" "rolling_max" - [136] "rolling_max_by" "rolling_mean" "rolling_mean_by" - [139] "rolling_median" "rolling_median_by" "rolling_min" - [142] "rolling_min_by" "rolling_quantile" "rolling_quantile_by" - [145] "rolling_skew" "rolling_std" "rolling_std_by" - [148] "rolling_sum" "rolling_sum_by" "rolling_var" - [151] "rolling_var_by" "round" "sample" - [154] "search_sorted" "set_sorted" "shift" - [157] "shift_and_fill" "shrink_dtype" "shuffle" - [160] "sign" "sin" "sinh" - [163] "skew" "slice" "sort" - [166] "sort_by" "sqrt" "std" - [169] "str" "struct" "sub" - [172] "sum" "tail" "tan" - [175] "tanh" "to_physical" "to_r" - [178] "to_series" "top_k" "unique" - [181] "unique_counts" "upper_bound" "value_counts" - [184] "var" "when" "xor" + [67] "gt_eq" "has_nulls" "hash" + [70] "head" "implode" "inspect" + [73] "interpolate" "is_between" "is_duplicated" + [76] "is_finite" "is_first_distinct" "is_in" + [79] "is_infinite" "is_last_distinct" "is_nan" + [82] "is_not_nan" "is_not_null" "is_null" + [85] "is_unique" "kurtosis" "last" + [88] "len" "limit" "list" + [91] "log" "log10" "lower_bound" + [94] "lt" "lt_eq" "map_batches" + [97] "map_elements" "max" "mean" + [100] "median" "meta" "min" + [103] "mod" "mode" "mul" + [106] "n_unique" "name" "nan_max" + [109] "nan_min" "neq" "neq_missing" + [112] "not" "null_count" "or" + [115] "otherwise" "over" "pct_change" + [118] "peak_max" "peak_min" "pow" + [121] "print" "product" "qcut" + [124] "quantile" "rank" "rechunk" + [127] "reinterpret" "rep" "repeat_by" + [130] "replace" "reshape" "reverse" + [133] "rle" "rle_id" "rolling" + [136] "rolling_max" "rolling_max_by" "rolling_mean" + [139] "rolling_mean_by" "rolling_median" "rolling_median_by" + [142] "rolling_min" "rolling_min_by" "rolling_quantile" + [145] "rolling_quantile_by" "rolling_skew" "rolling_std" + [148] "rolling_std_by" "rolling_sum" "rolling_sum_by" + [151] "rolling_var" "rolling_var_by" "round" + [154] "sample" "search_sorted" "set_sorted" + [157] "shift" "shift_and_fill" "shrink_dtype" + [160] "shuffle" "sign" "sin" + [163] "sinh" "skew" "slice" + [166] "sort" "sort_by" "sqrt" + [169] "std" "str" "struct" + [172] "sub" "sum" "tail" + [175] "tan" "tanh" "to_physical" + [178] "to_r" "to_series" "top_k" + [181] "unique" "unique_counts" "upper_bound" + [184] "value_counts" "var" "when" + [187] "xor" --- @@ -672,47 +674,47 @@ [64] "flags" "flatten" "floor" [67] "floor_div" "forward_fill" "gather" [70] "gather_every" "gt" "gt_eq" - [73] "hash" "head" "implode" - [76] "interpolate" "is_between" "is_duplicated" - [79] "is_finite" "is_first_distinct" "is_in" - [82] "is_infinite" "is_last_distinct" "is_nan" - [85] "is_not_nan" "is_not_null" "is_null" - [88] "is_numeric" "is_sorted" "is_unique" - [91] "item" "kurtosis" "last" - [94] "len" "limit" "list" - [97] "log" "log10" "lower_bound" - [100] "lt" "lt_eq" "map_batches" - [103] "map_elements" "max" "mean" - [106] "median" "min" "mod" - [109] "mode" "mul" "n_chunks" - [112] "n_unique" "name" "nan_max" - [115] "nan_min" "neq" "neq_missing" - [118] "not" "null_count" "or" - [121] "pct_change" "peak_max" "peak_min" - [124] "pow" "print" "product" - [127] "qcut" "quantile" "rank" - [130] "rechunk" "reinterpret" "rename" - [133] "rep" "repeat_by" "replace" - [136] "reshape" "reverse" "rle" - [139] "rle_id" "rolling_max" "rolling_max_by" - [142] "rolling_mean" "rolling_mean_by" "rolling_median" - [145] "rolling_median_by" "rolling_min" "rolling_min_by" - [148] "rolling_quantile" "rolling_quantile_by" "rolling_skew" - [151] "rolling_std" "rolling_std_by" "rolling_sum" - [154] "rolling_sum_by" "rolling_var" "rolling_var_by" - [157] "round" "sample" "search_sorted" - [160] "set_sorted" "shape" "shift" - [163] "shift_and_fill" "shrink_dtype" "shuffle" - [166] "sign" "sin" "sinh" - [169] "skew" "slice" "sort" - [172] "sort_by" "sqrt" "std" - [175] "str" "struct" "sub" - [178] "sum" "tail" "tan" - [181] "tanh" "to_frame" "to_list" - [184] "to_lit" "to_physical" "to_r" - [187] "to_vector" "top_k" "unique" - [190] "unique_counts" "upper_bound" "value_counts" - [193] "var" "xor" + [73] "has_nulls" "hash" "head" + [76] "implode" "interpolate" "is_between" + [79] "is_duplicated" "is_finite" "is_first_distinct" + [82] "is_in" "is_infinite" "is_last_distinct" + [85] "is_nan" "is_not_nan" "is_not_null" + [88] "is_null" "is_numeric" "is_sorted" + [91] "is_unique" "item" "kurtosis" + [94] "last" "len" "limit" + [97] "list" "log" "log10" + [100] "lower_bound" "lt" "lt_eq" + [103] "map_batches" "map_elements" "max" + [106] "mean" "median" "min" + [109] "mod" "mode" "mul" + [112] "n_chunks" "n_unique" "name" + [115] "nan_max" "nan_min" "neq" + [118] "neq_missing" "not" "null_count" + [121] "or" "pct_change" "peak_max" + [124] "peak_min" "pow" "print" + [127] "product" "qcut" "quantile" + [130] "rank" "rechunk" "reinterpret" + [133] "rename" "rep" "repeat_by" + [136] "replace" "reshape" "reverse" + [139] "rle" "rle_id" "rolling_max" + [142] "rolling_max_by" "rolling_mean" "rolling_mean_by" + [145] "rolling_median" "rolling_median_by" "rolling_min" + [148] "rolling_min_by" "rolling_quantile" "rolling_quantile_by" + [151] "rolling_skew" "rolling_std" "rolling_std_by" + [154] "rolling_sum" "rolling_sum_by" "rolling_var" + [157] "rolling_var_by" "round" "sample" + [160] "search_sorted" "set_sorted" "shape" + [163] "shift" "shift_and_fill" "shrink_dtype" + [166] "shuffle" "sign" "sin" + [169] "sinh" "skew" "slice" + [172] "sort" "sort_by" "sqrt" + [175] "std" "str" "struct" + [178] "sub" "sum" "tail" + [181] "tan" "tanh" "to_frame" + [184] "to_list" "to_lit" "to_physical" + [187] "to_r" "to_vector" "top_k" + [190] "unique" "unique_counts" "upper_bound" + [193] "value_counts" "var" "xor" --- diff --git a/tests/testthat/test-expr_expr.R b/tests/testthat/test-expr_expr.R index 16c1acd29..19ad7e47d 100644 --- a/tests/testthat/test-expr_expr.R +++ b/tests/testthat/test-expr_expr.R @@ -2982,3 +2982,16 @@ test_that("all works", { list(a = TRUE, b = FALSE, c = NA, d = NA) ) }) + +test_that("has_nulls works", { + df = pl$DataFrame( + a = c(NA, 1, NA), + b = c(1, NA, 2), + c = c(1, 2, 3) + ) + + expect_identical( + df$select(pl$all()$has_nulls())$to_list(), + list(a = TRUE, b = TRUE, c = FALSE) + ) +})