From ec81870f6809a3907aada976a050408dbf43e6ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Havelund=20Welling?= Date: Mon, 9 Oct 2023 10:42:17 +0300 Subject: [PATCH] add support for pl$concat(, . . . ) + add `to_supertypes` auto casting (#407) Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> --- NEWS.md | 1 + R/dataframe__frame.R | 87 +++++++++++++++++++- R/error__rpolarserr.R | 13 ++- R/expr__meta.R | 3 +- R/extendr-wrappers.R | 30 ++++--- R/functions__eager.R | 139 +++++++++++++++++++++++++------- R/lazyframe__lazy.R | 12 ++- R/series__series.R | 2 +- R/utils.R | 9 ++- man/DataFrame_n_chunks.Rd | 77 ++++++++++++++++++ man/DataFrame_rechunk.Rd | 74 +++++++++++++++++ man/LazyFrame_unnest.Rd | 8 +- man/Series_dtype.Rd | 11 --- man/Series_flags.Rd | 21 +++++ man/pl_concat.Rd | 33 ++++++-- src/rust/src/concat.rs | 94 +++++++++++++++++++++ src/rust/src/lib.rs | 3 + src/rust/src/rdataframe/mod.rs | 28 ++++++- src/rust/src/rlib.rs | 51 +----------- src/rust/src/utils/mod.rs | 102 +++++++++++++++++++++-- tests/testthat/test-concat.R | 84 +++++++++++++++++-- tests/testthat/test-dataframe.R | 20 ++++- tests/testthat/test-expr_arr.R | 1 - tests/testthat/test-lazy.R | 12 +-- 24 files changed, 761 insertions(+), 154 deletions(-) create mode 100644 man/DataFrame_n_chunks.Rd create mode 100644 man/DataFrame_rechunk.Rd create mode 100644 man/Series_flags.Rd create mode 100644 src/rust/src/concat.rs diff --git a/NEWS.md b/NEWS.md index 11fbc3aac..9201a7a69 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,7 @@ ## What's changed +- `pl$concat()` now also supports `Series`, `Expr` and `LazyFrame` (#407). - New method `$unnest()` for `LazyFrame` (#397). - New method `$sample()` for `DataFrame` (#399). - New method `$meta$tree_format()` to display an `Expr` as a tree (#401). diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 2fb60866e..18f970061 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -133,7 +133,6 @@ DataFrame #' #' # custom schema #' pl$DataFrame(iris, schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8)) - pl$DataFrame = function(..., make_names_unique = TRUE, schema = NULL) { largs = unpack_list(...) @@ -181,9 +180,9 @@ pl$DataFrame = function(..., make_names_unique = TRUE, schema = NULL) { names(largs) = keys lapply(seq_along(largs), \(x) { varname = keys[x] - out <- pl$lit(largs[[x]]) + out = pl$lit(largs[[x]]) if (!is.null(schema) && varname %in% names(schema)) { - out <- out$cast(schema[[varname]], strict = TRUE) + out = out$cast(schema[[varname]], strict = TRUE) } out$alias(varname) }) |> @@ -1033,6 +1032,88 @@ DataFrame_first = function() { self$lazy()$first()$collect() } + +#' @title Number of chunks of the Series in a DataFrame +#' @description +#' Number of chunks (memory allocations) for all or first Series in a DataFrame. +#' @details +#' A DataFrame is a vector of Series. Each Series in rust-polars is a wrapper +#' around a ChunkedArray, which is like a virtual contiguous vector physically +#' backed by an ordered set of chunks. Each chunk of values has a contiguous +#' memory layout and is an arrow array. Arrow arrays are a fast, thread-safe and +#' cross-platform memory layout. +#' +#' In R, combining with `c()` or `rbind()` requires immediate vector re-allocation +#' to place vector values in contiguous memory. This is slow and memory consuming, +#' and it is why repeatedly appending to a vector in R is discouraged. +#' +#' In polars, when we concatenate or append to Series or DataFrame, the +#' re-allocation can be avoided or delayed by simply appending chunks to each +#' individual Series. However, if chunks become many and small or are misaligned +#' across Series, this can hurt the performance of subsequent operations. +#' +#' Most places in the polars api where chunking could occur, the user have to +#' typically actively opt-out by setting an argument `rechunk = FALSE`. +#' +#' @keywords DataFrame +#' @param strategy Either `"all"` or `"first"`. `"first"` only returns chunks +#' for the first Series. +#' @return A real vector of chunk counts per Series. +#' @seealso [`$rechunk()`][DataFrame_rechunk] +#' @examples +#' # create DataFrame with misaligned chunks +#' df = pl$concat( +#' 1:10, # single chunk +#' pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks +#' how = "horizontal" +#' ) +#' df +#' df$n_chunks() +#' +#' # rechunk a chunked DataFrame +#' df$rechunk()$n_chunks() +#' +#' # rechunk is not an in-place operation +#' df$n_chunks() +#' +#' # The following toy example emulates the Series "chunkyness" in R. Here it a +#' # S3-classed list with same type of vectors and where have all relevant S3 +#' # generics implemented to make behave as if it was a regular vector. +#' "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") +#' print.chunked_vector = \(x, ...) print(unlist(x), ...) +#' c.chunked_vector = \(...) { +#' structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +#' } +#' rechunk = \(x) structure(unlist(x), class = "chunked_vector") +#' x = structure(list(1:4, 5L), class = "chunked_vector") +#' x +#' x + 5:1 +#' lapply(x, tracemem) # trace chunks to verify no re-allocation +#' z = c(x, x) +#' z # looks like a plain vector +#' lapply(z, tracemem) # mem allocation in z are the same from x +#' str(z) +#' z = rechunk(z) +#' str(z) +DataFrame_n_chunks = function(strategy = "all") { + .pr$DataFrame$n_chunks(self, strategy) |> + unwrap("in n_chunks():") +} + + +#' @title Rechunk a DataFrame +#' @description Rechunking re-allocates any "chunked" memory allocations to +#' speed-up e.g. vectorized operations. +#' @inherit DataFrame_n_chunks details examples +#' +#' @keywords DataFrame +#' @return A DataFrame +#' @seealso [`$n_chunks()`][DataFrame_n_chunks] +DataFrame_rechunk = function() { + .pr$DataFrame$rechunk(self) +} + + #' @title Get the last row of the DataFrame. #' @keywords DataFrame #' @return A DataFrame with one row. diff --git a/R/error__rpolarserr.R b/R/error__rpolarserr.R index 37a86b041..b851e3bb9 100644 --- a/R/error__rpolarserr.R +++ b/R/error__rpolarserr.R @@ -53,12 +53,19 @@ bad_robj = function(r) { .pr$RPolarsErr$new()$bad_robj(r) } -Err_plain = function(x) { - Err(.pr$RPolarsErr$new()$plain(x)) +Err_plain = function(...) { + Err(.pr$RPolarsErr$new()$plain(paste(..., collapse = " "))) } # short hand for extracting an error context in unit testing, will raise error if not an RPolarsErr -get_err_ctx = \(x) unwrap_err(result(x))$contexts() +get_err_ctx = \(x, select = NULL) { + ctx = unwrap_err(result(x))$contexts() + if (is.null(select)) { + ctx + } else { + ctx[[match.arg(select, names(ctx))]] + } +} # wrapper to return Result diff --git a/R/expr__meta.R b/R/expr__meta.R index 76a6e7d8d..d88f7026c 100644 --- a/R/expr__meta.R +++ b/R/expr__meta.R @@ -166,9 +166,8 @@ ExprMeta_is_regex_projection = function() { #' @examples #' my_expr = (pl$col("foo") * pl$col("bar"))$sum()$over(pl$col("ham")) / 2 #' my_expr$meta$tree_format() - ExprMeta_tree_format = function(return_as_string = FALSE) { - out <- .pr$Expr$meta_tree_format(self) |> + out = .pr$Expr$meta_tree_format(self) |> unwrap("in $tree_format():") if (isTRUE(return_as_string)) { out diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 6c6634baf..1a4ca3248 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -11,18 +11,6 @@ #' @useDynLib polars, .registration = TRUE NULL -rlazy_csv_reader <- function(path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates) .Call(wrap__rlazy_csv_reader, path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates) - -import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_count, memmap) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_count, memmap) - -new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory) - -concat_df <- function(vdf) .Call(wrap__concat_df, vdf) - -hor_concat_df <- function(dfs) .Call(wrap__hor_concat_df, dfs) - -diag_concat_df <- function(dfs) .Call(wrap__diag_concat_df, dfs) - min_exprs <- function(exprs) .Call(wrap__min_exprs, exprs) max_exprs <- function(exprs) .Call(wrap__max_exprs, exprs) @@ -75,6 +63,20 @@ test_wrong_call_pl_lit <- function(robj) .Call(wrap__test_wrong_call_pl_lit, rob polars_features <- function() .Call(wrap__polars_features) +concat_lf <- function(l, rechunk, parallel, to_supertypes) .Call(wrap__concat_lf, l, rechunk, parallel, to_supertypes) + +diag_concat_lf <- function(l, rechunk, parallel) .Call(wrap__diag_concat_lf, l, rechunk, parallel) + +hor_concat_df <- function(l) .Call(wrap__hor_concat_df, l) + +concat_series <- function(l, rechunk, to_supertypes) .Call(wrap__concat_series, l, rechunk, to_supertypes) + +rlazy_csv_reader <- function(path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates) .Call(wrap__rlazy_csv_reader, path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates) + +import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_count, memmap) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_count, memmap) + +new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory) + test_rpolarserr <- function() .Call(wrap__test_rpolarserr) setup_renv <- function() .Call(wrap__setup_renv) @@ -111,6 +113,10 @@ DataFrame <- new.env(parent = emptyenv()) DataFrame$shape <- function() .Call(wrap__DataFrame__shape, self) +DataFrame$n_chunks <- function(strategy) .Call(wrap__DataFrame__n_chunks, self, strategy) + +DataFrame$rechunk <- function() .Call(wrap__DataFrame__rechunk, self) + DataFrame$clone_see_me_macro <- function() .Call(wrap__DataFrame__clone_see_me_macro, self) DataFrame$default <- function() .Call(wrap__DataFrame__default) diff --git a/R/functions__eager.R b/R/functions__eager.R index 7e98573c8..d925d8582 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -1,13 +1,24 @@ #' Concat polars objects #' @name pl_concat -#' @param l list of DataFrame, or Series, LazyFrame or Expr -#' @param rechunk perform a rechunk at last -#' @param how choice of bind direction "vertical"(rbind) "horizontal"(cbind) "diagonal" diagonally -#' @param parallel BOOL default TRUE, only used for LazyFrames +#' @param ... Either individual unpacked args or args wrapped in list(). Args can +#' be eager as DataFrame, Series and R vectors, or lazy as LazyFrame and Expr. +#' The first element determines the output of `$concat()`: if the first element +#' is lazy, a LazyFrame is returned; otherwise, a DataFrame is returned (note +#' that if the first element is eager, all other elements have to be eager to +#' avoid implicit collect). +#' @param rechunk Perform a rechunk at last. +#' @param how Bind direction. Can be "vertical" (like `rbind()`), "horizontal" +#' (like `cbind()`), or "diagonal". +#' @param parallel Only used for LazyFrames. If `TRUE` (default), lazy +#' computations may be executed in parallel. +#' @param to_supertypes If `TRUE` (default), cast columns shared super types, if +#' any. For example, if we try to vertically concatenate two columns of types `i32` +#' and `f64`, the column of type `i32` will be cast to `f64` beforehand. This +#' argument is equivalent to the "_relaxed" operations in Python polars. #' #' @details -#' Categorical columns/Series must have been constructed while global string cache enabled -#' [`pl$enable_string_cache()`][pl_enable_string_cache] +#' Categorical columns/Series must have been constructed while global string +#' cache enabled. See [`pl$enable_string_cache()`][pl_enable_string_cache]. #' #' #' @return DataFrame, or Series, LazyFrame or Expr @@ -23,7 +34,6 @@ #' }) #' pl$concat(l_ver, how = "vertical") #' -#' #' # horizontal #' l_hor = lapply(1:10, function(i) { #' l_internal = list( @@ -34,44 +44,116 @@ #' pl$DataFrame(l_internal) #' }) #' pl$concat(l_hor, how = "horizontal") +#' #' # diagonal #' pl$concat(l_hor, how = "diagonal") +#' +#' # if two columns don't share the same type, concat() will error unless we use +#' # `to_supertypes = TRUE`: +#' test = pl$DataFrame(x = 1L) # i32 +#' test2 = pl$DataFrame(x = 1.0) #f64 +#' +#' pl$concat(test, test2, to_supertypes = TRUE) pl$concat = function( - l, # list of DataFrames or Series or lazyFrames or expr + ..., # list of DataFrames or Series or lazyFrames or expr rechunk = TRUE, how = c("vertical", "horizontal", "diagonal"), - parallel = TRUE # not used yet - ) { + parallel = TRUE, + to_supertypes = FALSE) { + # unpack arg list + l = unpack_list(..., skip_classes = "data.frame") + + # nothing becomes NULL + if (length(l) == 0L) { + return(NULL) + } + ## Check inputs - how = match.arg(how[1L], c("vertical", "horizontal", "diagonal")) + how_args = c("vertical", "horizontal", "diagonal") # , "vertical_relaxed", "diangonal_relaxed") + + how = match.arg(how[1L], how_args) |> + result() |> + unwrap("in pl$concat()") - # dispatch on item class and how first = l[[1L]] - result = pcase( - inherits(first, "DataFrame"), + eager = !inherits(first, "LazyFrame") + args_modified = names(as.list(sys.call()[-1L])) + + # check not using any mixing of types which could lead to implicit collect + if (eager) { + for (i in seq_along(l)) { + if (inherits(l[[i]], c("LazyFrame", "Expr"))) { + .pr$RPolarsErr$new()$ + plain("tip: explicitly collect lazy inputs first, e.g. pl$concat(dataframe, lazyframe$collect())")$ + plain("LazyFrame or Expr not allowed if first arg is a DataFrame, to avoid implicit collect")$ + bad_robj(l[[i]])$ + bad_arg(paste("of those to concatenate, number", i)) |> + Err() |> + unwrap("in pl$concat()") + } + } + } + + # dispatch on item class and how + Result_out = pcase( + how == "vertical" && (inherits(first, "Series") || is.vector(first)), { - vdf = l_to_vdf(l) - pcase( - how == "vertical", concat_df(vdf), - how == "diagonal", diag_concat_df(vdf), - how == "horizontal", hor_concat_df(vdf), - or_else = stopf("Internal error") - ) + if (any(args_modified %in% c("parallel"))) { + warning( + "in pl$concat(): argument `parallel` is not used when concatenating Series", + call. = FALSE + ) + } + concat_series(l, rechunk, to_supertypes) }, - inherits(first, "Series"), + how == "vertical", + concat_lf(l, rechunk, parallel, to_supertypes), + how == "diagonal", { - stopf("not implemented Series") + if (any(args_modified %in% c("to_supertypes"))) { + warning( + "Argument `to_supertypes` is not used when how=='diagonal'", + call. = FALSE + ) + } + diag_concat_lf(l, rechunk, parallel) }, - inherits(first, "Expr"), + how == "horizontal" && !eager, { - stopf("not implemented Expr") + Err_plain( + "how=='horizontal' is not supported for lazy (first element is LazyFrame).", + "Try e.g. $join() to get Lazy join or pl$concat(lf1$collect(), lf2, lf3).", + "to get a eager horizontal concatenation" + ) + }, + how == "horizontal", + { + if (any(args_modified %in% c("parallel", "to_supertypes"))) { + warning( + "Arguments `parallel`, `rechunk`, `eager` and `to_supertypes` are not used when how=='horizontal'", + call. = FALSE + ) + } + hor_concat_df(l) }, # TODO implement Series, Expr, Lazy etc - or_else = stopf(paste0("type of first list element: '", class(first), "' is not supported")) + or_else = Err_plain("internal error:", how, "not handled") ) - unwrap(result) + # convert back from lazy if eager + and_then(Result_out, \(x) { + pcase( + # run-time assertion for future changes + inherits(x, "DataFrame") && !eager, Err_plain("internal logical error in pl$concat()"), + + # must collect as in rust side only lazy concat is implemented. Eager inputs are wrapped in + # lazy and then collected again. This does not mean any user input is collected. + inherits(x, "LazyFrame") && eager, Ok(x$collect()), + or_else = Ok(x) + ) + }) |> + unwrap("in pl$concat()") } @@ -136,8 +218,7 @@ pl$date_range = function( name = NULL, # : str | None = None, time_unit = "us", time_zone = NULL, # : str | None = None - explode = TRUE - ) { + explode = TRUE) { if (missing(end)) { end = start interval = "1h" diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 223feb709..6b556b001 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -146,7 +146,6 @@ LazyFrame #' iris, #' schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8) #' )$collect() - pl$LazyFrame = function(...) { pl$DataFrame(...)$lazy() } @@ -1326,10 +1325,10 @@ LazyFrame_clone = function() { #' b = c("one", "two", "three", "four", "five"), #' c = 6:10 #' )$ -#' select( -#' pl$col("b")$to_struct(), -#' pl$col("a", "c")$to_struct()$alias("a_and_c") -#' ) +#' select( +#' pl$col("b")$to_struct(), +#' pl$col("a", "c")$to_struct()$alias("a_and_c") +#' ) #' lf$collect() #' #' # by default, all struct columns are unnested @@ -1337,10 +1336,9 @@ LazyFrame_clone = function() { #' #' # we can specify specific columns to unnest #' lf$unnest("a_and_c")$collect() - LazyFrame_unnest = function(names = NULL) { if (is.null(names)) { - names <- names(which(dtypes_are_struct(.pr$LazyFrame$schema(self)$ok))) + names = names(which(dtypes_are_struct(.pr$LazyFrame$schema(self)$ok))) } unwrap(.pr$LazyFrame$unnest(self, names), "in $unnest():") } diff --git a/R/series__series.R b/R/series__series.R index c06c97e8b..62f121ca8 100644 --- a/R/series__series.R +++ b/R/series__series.R @@ -759,7 +759,7 @@ Series_dtype = method_as_property(function() { #' @keywords Series #' @return DataType #' @aliases Series_flags -#' @name Series_dtype +#' @name Series_flags #' @details property sorted flags are not settable, use set_sorted #' @examples #' pl$Series(1:4)$sort()$flags diff --git a/R/utils.R b/R/utils.R index 5e7f071bd..cf801735d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -93,6 +93,7 @@ list2 = list #' Internal unpack list #' @noRd #' @param l any list +#' @param skip_classes char vec, do not unpack list inherits skip_classes. #' @details py-polars syntax only allows e.g. `df.select([expr1, expr2,])` and not #' `df.select(expr1, expr2,)`. r-polars also allows user to directly write #' `df$select(expr1, expr2)` or `df$select(list(expr1,expr2))`. Unpack list @@ -103,9 +104,13 @@ list2 = list #' f = \(...) unpack_list(list(...)) #' identical(f(list(1L, 2L, 3L)), f(1L, 2L, 3L)) # is TRUE #' identical(f(list(1L, 2L), 3L), f(1L, 2L, 3L)) # is FALSE -unpack_list = function(...) { +unpack_list = function(..., skip_classes = NULL) { l = list2(...) - if (length(l) == 1L && is.list(l[[1L]])) { + if ( + length(l) == 1L && + is.list(l[[1L]]) && + !(!is.null(skip_classes) && inherits(l[[1L]], skip_classes)) + ) { l[[1L]] } else { l diff --git a/man/DataFrame_n_chunks.Rd b/man/DataFrame_n_chunks.Rd new file mode 100644 index 000000000..4fe2e50cc --- /dev/null +++ b/man/DataFrame_n_chunks.Rd @@ -0,0 +1,77 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe__frame.R +\name{DataFrame_n_chunks} +\alias{DataFrame_n_chunks} +\title{Number of chunks of the Series in a DataFrame} +\usage{ +DataFrame_n_chunks(strategy = "all") +} +\arguments{ +\item{strategy}{Either \code{"all"} or \code{"first"}. \code{"first"} only returns chunks +for the first Series.} +} +\value{ +A real vector of chunk counts per Series. +} +\description{ +Number of chunks (memory allocations) for all or first Series in a DataFrame. +} +\details{ +A DataFrame is a vector of Series. Each Series in rust-polars is a wrapper +around a ChunkedArray, which is like a virtual contiguous vector physically +backed by an ordered set of chunks. Each chunk of values has a contiguous +memory layout and is an arrow array. Arrow arrays are a fast, thread-safe and +cross-platform memory layout. + +In R, combining with \code{c()} or \code{rbind()} requires immediate vector re-allocation +to place vector values in contiguous memory. This is slow and memory consuming, +and it is why repeatedly appending to a vector in R is discouraged. + +In polars, when we concatenate or append to Series or DataFrame, the +re-allocation can be avoided or delayed by simply appending chunks to each +individual Series. However, if chunks become many and small or are misaligned +across Series, this can hurt the performance of subsequent operations. + +Most places in the polars api where chunking could occur, the user have to +typically actively opt-out by setting an argument \code{rechunk = FALSE}. +} +\examples{ +# create DataFrame with misaligned chunks +df = pl$concat( + 1:10, # single chunk + pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks + how = "horizontal" +) +df +df$n_chunks() + +# rechunk a chunked DataFrame +df$rechunk()$n_chunks() + +# rechunk is not an in-place operation +df$n_chunks() + +# The following toy example emulates the Series "chunkyness" in R. Here it a +# S3-classed list with same type of vectors and where have all relevant S3 +# generics implemented to make behave as if it was a regular vector. +"+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") +print.chunked_vector = \(x, ...) print(unlist(x), ...) +c.chunked_vector = \(...) { + structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +} +rechunk = \(x) structure(unlist(x), class = "chunked_vector") +x = structure(list(1:4, 5L), class = "chunked_vector") +x +x + 5:1 +lapply(x, tracemem) # trace chunks to verify no re-allocation +z = c(x, x) +z # looks like a plain vector +lapply(z, tracemem) # mem allocation in z are the same from x +str(z) +z = rechunk(z) +str(z) +} +\seealso{ +\code{\link[=DataFrame_rechunk]{$rechunk()}} +} +\keyword{DataFrame} diff --git a/man/DataFrame_rechunk.Rd b/man/DataFrame_rechunk.Rd new file mode 100644 index 000000000..25606064c --- /dev/null +++ b/man/DataFrame_rechunk.Rd @@ -0,0 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe__frame.R +\name{DataFrame_rechunk} +\alias{DataFrame_rechunk} +\title{Rechunk a DataFrame} +\usage{ +DataFrame_rechunk() +} +\value{ +A DataFrame +} +\description{ +Rechunking re-allocates any "chunked" memory allocations to +speed-up e.g. vectorized operations. +} +\details{ +A DataFrame is a vector of Series. Each Series in rust-polars is a wrapper +around a ChunkedArray, which is like a virtual contiguous vector physically +backed by an ordered set of chunks. Each chunk of values has a contiguous +memory layout and is an arrow array. Arrow arrays are a fast, thread-safe and +cross-platform memory layout. + +In R, combining with \code{c()} or \code{rbind()} requires immediate vector re-allocation +to place vector values in contiguous memory. This is slow and memory consuming, +and it is why repeatedly appending to a vector in R is discouraged. + +In polars, when we concatenate or append to Series or DataFrame, the +re-allocation can be avoided or delayed by simply appending chunks to each +individual Series. However, if chunks become many and small or are misaligned +across Series, this can hurt the performance of subsequent operations. + +Most places in the polars api where chunking could occur, the user have to +typically actively opt-out by setting an argument \code{rechunk = FALSE}. +} +\examples{ +# create DataFrame with misaligned chunks +df = pl$concat( + 1:10, # single chunk + pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks + how = "horizontal" +) +df +df$n_chunks() + +# rechunk a chunked DataFrame +df$rechunk()$n_chunks() + +# rechunk is not an in-place operation +df$n_chunks() + +# The following toy example emulates the Series "chunkyness" in R. Here it a +# S3-classed list with same type of vectors and where have all relevant S3 +# generics implemented to make behave as if it was a regular vector. +"+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector") +print.chunked_vector = \(x, ...) print(unlist(x), ...) +c.chunked_vector = \(...) { + structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector") +} +rechunk = \(x) structure(unlist(x), class = "chunked_vector") +x = structure(list(1:4, 5L), class = "chunked_vector") +x +x + 5:1 +lapply(x, tracemem) # trace chunks to verify no re-allocation +z = c(x, x) +z # looks like a plain vector +lapply(z, tracemem) # mem allocation in z are the same from x +str(z) +z = rechunk(z) +str(z) +} +\seealso{ +\code{\link[=DataFrame_n_chunks]{$n_chunks()}} +} +\keyword{DataFrame} diff --git a/man/LazyFrame_unnest.Rd b/man/LazyFrame_unnest.Rd index d8414d15a..0f884bbdb 100644 --- a/man/LazyFrame_unnest.Rd +++ b/man/LazyFrame_unnest.Rd @@ -23,10 +23,10 @@ lf = pl$LazyFrame( b = c("one", "two", "three", "four", "five"), c = 6:10 )$ - select( - pl$col("b")$to_struct(), - pl$col("a", "c")$to_struct()$alias("a_and_c") - ) + select( + pl$col("b")$to_struct(), + pl$col("a", "c")$to_struct()$alias("a_and_c") +) lf$collect() # by default, all struct columns are unnested diff --git a/man/Series_dtype.Rd b/man/Series_dtype.Rd index a29fa59a8..8d591f6e6 100644 --- a/man/Series_dtype.Rd +++ b/man/Series_dtype.Rd @@ -2,30 +2,19 @@ % Please edit documentation in R/series__series.R \name{Series_dtype} \alias{Series_dtype} -\alias{Series_flags} \title{Get data type of Series} \usage{ Series_dtype() - -Series_flags() } \value{ -DataType - DataType } \description{ Get data type of Series - -Get data type of Series -} -\details{ -property sorted flags are not settable, use set_sorted } \examples{ pl$Series(1:4)$dtype pl$Series(c(1, 2))$dtype pl$Series(letters)$dtype -pl$Series(1:4)$sort()$flags } \keyword{Series} diff --git a/man/Series_flags.Rd b/man/Series_flags.Rd new file mode 100644 index 000000000..d909d2c6a --- /dev/null +++ b/man/Series_flags.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/series__series.R +\name{Series_flags} +\alias{Series_flags} +\title{Get data type of Series} +\usage{ +Series_flags() +} +\value{ +DataType +} +\description{ +Get data type of Series +} +\details{ +property sorted flags are not settable, use set_sorted +} +\examples{ +pl$Series(1:4)$sort()$flags +} +\keyword{Series} diff --git a/man/pl_concat.Rd b/man/pl_concat.Rd index 288a30d37..227bebb70 100644 --- a/man/pl_concat.Rd +++ b/man/pl_concat.Rd @@ -4,13 +4,25 @@ \alias{pl_concat} \title{Concat polars objects} \arguments{ -\item{l}{list of DataFrame, or Series, LazyFrame or Expr} +\item{...}{Either individual unpacked args or args wrapped in list(). Args can +be eager as DataFrame, Series and R vectors, or lazy as LazyFrame and Expr. +The first element determines the output of \verb{$concat()}: if the first element +is lazy, a LazyFrame is returned; otherwise, a DataFrame is returned (note +that if the first element is eager, all other elements have to be eager to +avoid implicit collect).} -\item{rechunk}{perform a rechunk at last} +\item{rechunk}{Perform a rechunk at last.} -\item{how}{choice of bind direction "vertical"(rbind) "horizontal"(cbind) "diagonal" diagonally} +\item{how}{Bind direction. Can be "vertical" (like \code{rbind()}), "horizontal" +(like \code{cbind()}), or "diagonal".} -\item{parallel}{BOOL default TRUE, only used for LazyFrames} +\item{parallel}{Only used for LazyFrames. If \code{TRUE} (default), lazy +computations may be executed in parallel.} + +\item{to_supertypes}{If \code{TRUE} (default), cast columns shared super types, if +any. For example, if we try to vertically concatenate two columns of types \code{i32} +and \code{f64}, the column of type \code{i32} will be cast to \code{f64} beforehand. This +argument is equivalent to the "_relaxed" operations in Python polars.} } \value{ DataFrame, or Series, LazyFrame or Expr @@ -19,8 +31,8 @@ DataFrame, or Series, LazyFrame or Expr Concat polars objects } \details{ -Categorical columns/Series must have been constructed while global string cache enabled -\code{\link[=pl_enable_string_cache]{pl$enable_string_cache()}} +Categorical columns/Series must have been constructed while global string +cache enabled. See \code{\link[=pl_enable_string_cache]{pl$enable_string_cache()}}. } \examples{ # vertical @@ -33,7 +45,6 @@ l_ver = lapply(1:10, function(i) { }) pl$concat(l_ver, how = "vertical") - # horizontal l_hor = lapply(1:10, function(i) { l_internal = list( @@ -44,6 +55,14 @@ l_hor = lapply(1:10, function(i) { pl$DataFrame(l_internal) }) pl$concat(l_hor, how = "horizontal") + # diagonal pl$concat(l_hor, how = "diagonal") + +# if two columns don't share the same type, concat() will error unless we use +# `to_supertypes = TRUE`: +test = pl$DataFrame(x = 1L) # i32 +test2 = pl$DataFrame(x = 1.0) #f64 + +pl$concat(test, test2, to_supertypes = TRUE) } diff --git a/src/rust/src/concat.rs b/src/rust/src/concat.rs new file mode 100644 index 000000000..7ac7e873b --- /dev/null +++ b/src/rust/src/concat.rs @@ -0,0 +1,94 @@ +use crate::rdataframe::DataFrame; +use crate::robj_to; + +use crate::rdataframe::LazyFrame; +use crate::rpolarserr::*; +use crate::series::Series; +use extendr_api::prelude::*; +use polars::lazy::dsl; +use polars::prelude as pl; +use polars_core; +use polars_core::functions as pl_functions; +use std::result::Result; + +#[extendr] +fn concat_lf(l: Robj, rechunk: bool, parallel: bool, to_supertypes: bool) -> RResult { + let vlf = robj_to!(Vec, PLLazyFrame, l)?; + dsl::concat( + vlf, + pl::UnionArgs { + parallel, + rechunk, + to_supertypes, + }, + ) + .map_err(polars_to_rpolars_err) + .map(LazyFrame) +} + +#[extendr] +fn diag_concat_lf(l: Robj, rechunk: bool, parallel: bool) -> RResult { + let vlf = robj_to!(Vec, PLLazyFrame, l)?; + dsl::diag_concat_lf(vlf, rechunk, parallel) + .map_err(polars_to_rpolars_err) + .map(LazyFrame) +} + +#[extendr] +pub fn hor_concat_df(l: Robj) -> RResult { + let df_vec = robj_to!(Vec, PLDataFrame, l)?; + pl_functions::hor_concat_df(&df_vec) + .map_err(polars_to_rpolars_err) + .map(DataFrame) +} + +#[extendr] +pub fn concat_series(l: Robj, rechunk: Robj, to_supertypes: Robj) -> RResult { + let to_supertypes = robj_to!(bool, to_supertypes)?; + let mut s_vec = robj_to!(Vec, PLSeries, l)?; + + // find any common supertype and cast to it + if to_supertypes { + let shared_supertype: RResult> = s_vec + .iter() + .map(|s| s.dtype().clone()) + .fold(Ok(None), |acc, x| match acc { + Err(err) => Err(err), + Ok(None) => Ok(Some(x)), // first fold, acc is None, just us x, + Ok(Some(acc)) => polars_core::utils::get_supertype(&acc, &x) + .ok_or(RPolarsErr::new().plain("Series' have no common supertype".to_string())) + .map(|dt| Some(dt)), + }); + let shared_supertype = shared_supertype?.expect("cannot be None, unless empty s_vec"); + + for i in 0..s_vec.len() { + if *s_vec[i].dtype() != shared_supertype { + s_vec[i] = s_vec[i] + .cast(&shared_supertype) + .map_err(polars_to_rpolars_err)?; + }; + } + } + + let mut iter = s_vec.into_iter(); + let mut first_s = iter + .next() + .ok_or(RPolarsErr::new().plain("no series found to concatenate".into()))?; + for next_s in iter { + first_s.append(&next_s).map_err(polars_to_rpolars_err)?; + } + + if robj_to!(bool, rechunk)? { + Ok(first_s.rechunk().into()) + } else { + Ok(first_s.into()) + } +} + +extendr_module! { + mod concat; + fn concat_lf; + fn diag_concat_lf; + fn hor_concat_df; + fn concat_series; +} diff --git a/src/rust/src/lib.rs b/src/rust/src/lib.rs index 35cf71fdf..798b7a84e 100644 --- a/src/rust/src/lib.rs +++ b/src/rust/src/lib.rs @@ -13,6 +13,7 @@ pub mod concurrent; pub mod lazy; pub mod arrow_interop; +pub mod concat; pub mod conversion; pub mod conversion_r_to_s; pub mod conversion_s_to_r; @@ -40,6 +41,8 @@ pub use crate::rbackground::RBGPOOL; // Macro to generate exports extendr_module! { mod polars; + use rlib; + use concat; use rdataframe; use rpolarserr; use rbackground; diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index 63f54048d..4bccc8a87 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -8,9 +8,8 @@ use crate::conversion_r_to_s::robjname2series; use crate::lazy; use crate::rdatatype; use crate::rdatatype::RPolarsDataType; -use crate::rlib; use crate::robj_to; -use crate::rpolarserr::{polars_to_rpolars_err, RResult}; +use crate::rpolarserr::*; pub use lazy::dataframe::*; @@ -86,6 +85,29 @@ impl DataFrame { r!([shp.0, shp.1]) } + pub fn n_chunks(&self, strategy: Robj) -> RResult> { + let nchks: Vec<_> = self.0.iter().map(|s| s.n_chunks() as f64).collect(); + + match robj_to!(str, strategy)? { + "all" => Ok(nchks), + "first" => { + if nchks.is_empty() { + Ok(vec![]) + } else { + Ok(vec![nchks.into_iter().next().expect("has atleast len 1")]) + } + } + _ => { + Err(RPolarsErr::new() + .plain("strategy not recognized, neither 'all' or 'first'".into())) + } + } + } + + pub fn rechunk(&self) -> Self { + self.0.agg_chunks().into() + } + //renamed back to clone pub fn clone_see_me_macro(&self) -> DataFrame { self.clone() @@ -463,7 +485,7 @@ extendr_module! { use read_ipc; use read_parquet; use rdatatype; - use rlib; + impl DataFrame; impl VecDataFrame; } diff --git a/src/rust/src/rlib.rs b/src/rust/src/rlib.rs index 76ab78da4..116c4352f 100644 --- a/src/rust/src/rlib.rs +++ b/src/rust/src/rlib.rs @@ -2,61 +2,14 @@ use crate::lazy::dsl::Expr; use crate::lazy::dsl::ProtoExprArray; use crate::rdataframe::DataFrame; use crate::robj_to; - use crate::rpolarserr::{rdbg, RResult}; use crate::series::Series; -use crate::{rdataframe::VecDataFrame, utils::r_result_list}; use extendr_api::prelude::*; use polars::prelude as pl; use polars_core::functions as pl_functions; use std::result::Result; -#[extendr] -fn concat_df(vdf: &VecDataFrame) -> List { - //-> PyResult { - - use polars_core::error::PolarsResult; - use polars_core::utils::rayon::prelude::*; - - let identity_df = (*vdf.0.iter().peekable().peek().unwrap()) - .clone() - .slice(0, 0); - let rdfs: Vec> = - vdf.0.iter().map(|df| Ok(df.clone())).collect(); - let identity = || Ok(identity_df.clone()); - - let result = polars_core::POOL - .install(|| { - rdfs.into_par_iter() - .fold(identity, |acc: PolarsResult, df| { - let mut acc = acc?; - acc.vstack_mut(&df?)?; - Ok(acc) - }) - .reduce(identity, |acc, df| { - let mut acc = acc?; - acc.vstack_mut(&df?)?; - Ok(acc) - }) - }) - .map(DataFrame); - - r_result_list(result.map_err(|err| format!("{:?}", err))) -} - -#[extendr] -fn diag_concat_df(dfs: &VecDataFrame) -> List { - let df = pl_functions::diag_concat_df(&dfs.0[..]).map(DataFrame); - r_result_list(df.map_err(|err| format!("{:?}", err))) -} - -#[extendr] -pub fn hor_concat_df(dfs: &VecDataFrame) -> List { - let df = pl_functions::hor_concat_df(&dfs.0[..]).map(DataFrame); - r_result_list(df.map_err(|err| format!("{:?}", err))) -} - #[extendr] fn min_exprs(exprs: &ProtoExprArray) -> Expr { let exprs = exprs.to_vec("select"); @@ -278,9 +231,7 @@ fn polars_features() -> List { extendr_module! { mod rlib; - fn concat_df; - fn hor_concat_df; - fn diag_concat_df; + fn min_exprs; fn max_exprs; fn coalesce_exprs; diff --git a/src/rust/src/utils/mod.rs b/src/rust/src/utils/mod.rs index 70dfa5ce3..aa28bb6eb 100644 --- a/src/rust/src/utils/mod.rs +++ b/src/rust/src/utils/mod.rs @@ -3,16 +3,21 @@ pub mod extendr_concurrent; pub mod extendr_helpers; pub mod wrappers; +use crate::conversion_r_to_s::robjname2series; use crate::lazy::dsl::Expr; use crate::rdatatype::RPolarsDataType; -use crate::rpolarserr::{rdbg, rerr, RPolarsErr, RResult, WithRctx}; +use crate::rpolarserr::{polars_to_rpolars_err, rdbg, rerr, RPolarsErr, RResult, WithRctx}; +use crate::series::Series; use extendr_api::prelude::list; use std::any::type_name as tn; //use std::intrinsics::read_via_copy; use crate::lazy::dsl::robj_to_col; +use crate::rdataframe::{DataFrame, LazyFrame}; +use extendr_api::eval_string_with_params; use extendr_api::Attributes; use extendr_api::ExternalPtr; use extendr_api::Result as ExtendrResult; +use extendr_api::R; use polars::prelude as pl; //macro to translate polars NULLs and emulate R NA value of any type @@ -739,13 +744,78 @@ fn internal_rust_wrap_e(robj: Robj, str_to_lit: bool) -> RResult { } } -pub fn robj_to_lazyframe(robj: extendr_api::Robj) -> RResult { +pub fn robj_to_lazyframe(robj: extendr_api::Robj) -> RResult { let robj = unpack_r_result_list(robj)?; let rv = rdbg(&robj); - use crate::rdataframe::LazyFrame; - let res: Result, _> = robj.try_into(); - let ext_ldf = res.bad_val(rv).mistyped(tn::())?; - Ok(LazyFrame(ext_ldf.0.clone())) + + // closure to allow ?-convert extendr::Result to RResult + let res = || -> RResult { + match () { + // allow input as a DataFrame + _ if robj.inherits("DataFrame") => { + let extptr_df: ExternalPtr = robj.try_into()?; + Ok(extptr_df.lazy()) + } + _ if robj.inherits("LazyFrame") => { + let lf: ExternalPtr = robj.try_into()?; + let lf = LazyFrame(lf.0.clone()); + Ok(lf) + } + _ if robj.inherits("data.frame") => { + let df = unpack_r_eval(R!("polars:::result(pl$DataFrame({{robj}}))"))?; + let extptr_df: ExternalPtr = df.try_into()?; + Ok(extptr_df.lazy()) + } + _ => Ok(DataFrame::new_with_capacity(1) + .lazy() + .0 + .select(&[robj_to_rexpr(robj, true)?.0])) + .map(LazyFrame), + } + }(); + + res.bad_val(rv).mistyped(tn::()) +} + +pub fn robj_to_dataframe(robj: extendr_api::Robj) -> RResult { + let robj = unpack_r_result_list(robj)?; + let robj_clone = robj.clone(); + + // closure to allow ?-convert extendr::Result to RResult + let res = || -> RResult { + match () { + // allow input as a DataFrame + _ if robj.inherits("DataFrame") => { + let extptr_df: ExternalPtr = robj.try_into()?; + Ok(extptr_df.0.clone()) + } + _ if robj.inherits("data.frame") => { + let df = unpack_r_eval(R!("polars:::result(pl$DataFrame({{robj}}))"))?; + let extptr_df: ExternalPtr = df.try_into()?; + Ok(extptr_df.0.clone()) + } + _ => DataFrame::new_with_capacity(1) + .lazy() + .0 + .select(&[robj_to_rexpr(robj, true)?.0]) + .collect(), + } + .map(DataFrame) + .map_err(polars_to_rpolars_err) + }(); + + res.bad_val(rdbg(robj_clone)) + .plain("could not be converted into a DataFrame") +} + +pub fn robj_to_series(robj: extendr_api::Robj) -> RResult { + let robj = unpack_r_result_list(robj)?; + let robj_clone = robj.clone(); + robjname2series(robj, "") + .map(Series) + .map_err(polars_to_rpolars_err) + .bad_val(rdbg(robj_clone)) + .plain("could not be converted into a DataFrame") } pub fn list_expr_to_vec_pl_expr( @@ -848,6 +918,14 @@ macro_rules! robj_to_inner { $crate::utils::robj_to_binary_vec($a) }; + (Series, $a:ident) => { + $crate::utils::robj_to_series($a) + }; + + (PLSeries, $a:ident) => { + $crate::utils::robj_to_series($a).map(|ok| ok.0) + }; + (Expr, $a:ident) => { $crate::utils::robj_to_rexpr($a, true) }; @@ -895,6 +973,18 @@ macro_rules! robj_to_inner { $crate::utils::robj_to_lazyframe($a) }; + (PLLazyFrame, $a:ident) => { + $crate::utils::robj_to_lazyframe($a).map(|lf| lf.0) + }; + + (DataFrame, $a:ident) => { + $crate::utils::robj_to_dataframe($a) + }; + + (PLDataFrame, $a:ident) => { + $crate::utils::robj_to_dataframe($a).map(|lf| lf.0) + }; + (RArrow_schema, $a:ident) => { $crate::utils::robj_to_rarrow_schema($a) }; diff --git a/tests/testthat/test-concat.R b/tests/testthat/test-concat.R index f448a692f..90e028d89 100644 --- a/tests/testthat/test-concat.R +++ b/tests/testthat/test-concat.R @@ -1,20 +1,73 @@ test_that("concat dataframe", { - # vertical - l_ver = lapply(1:10, function(i) { + # mixing lazy with first eager not allowed + ctx = pl$concat(pl$DataFrame(mtcars), pl$LazyFrame(mtcars), how = "vertical") |> get_err_ctx() + expect_true(endsWith(ctx$BadArgument, "number 2")) + expect_true(endsWith(ctx$PlainErrorMessage, "avoid implicit collect")) + + ctx = pl$concat(pl$DataFrame(mtcars), mtcars$hp, pl$lit(mtcars$mpg), how = "horizontal") |> + get_err_ctx() + expect_true(endsWith(ctx$BadArgument, "number 3")) + expect_true(endsWith(ctx$PlainErrorMessage, "avoid implicit collect")) + + # mixing eager with first lazy is allowd + df_ref = rbind(mtcars, mtcars) + row.names(df_ref) = 1:64 + expect_identical( + pl$concat(pl$LazyFrame(mtcars), pl$DataFrame(mtcars), how = "vertical")$ + collect()$ + to_data_frame(), + df_ref + ) + + # vertical dfs + l_ver = lapply(1:3, function(i) { l_internal = list( a = 1:5, b = letters[1:5] ) pl$DataFrame(l_internal) }) + df_ver = pl$concat(l_ver, how = "vertical") expect_equal( df_ver$to_data_frame(), do.call(rbind, lapply(l_ver, function(df) df$to_data_frame())) ) + # unpack args allowed + df_ver_2 = pl$concat(l_ver[[1L]], l_ver[[2L]], l_ver[[3L]], how = "vertical") + expect_identical(df_ver$to_list(), df_ver_2$to_list()) + + # use supertypes + expect_identical( + pl$concat(l_ver[[1L]], pl$DataFrame(a = 2, b = 42L), how = "vertical", to_supertypes = TRUE)$to_list(), + pl$DataFrame(rbind(data.frame(a = 1:5, b = letters[1:5]), data.frame(a = 2, b = 42L)))$to_list() + ) + + # type 'relaxed' vertical concatenation is not allowed by default + expect_true( + pl$concat(l_ver[[1L]], pl$DataFrame(a = 2, b = 42L), how = "vertical") |> + get_err_ctx() |> + (\(ctx) ctx$PolarsError)() |> + grepl(pat = "dtypes for column", fixed = TRUE) + ) + + + # check lazy eager is identical + l_ver_lazy = lapply(l_ver, \(df) df$lazy()) + expect_identical( + pl$concat(l_ver_lazy)$collect()$to_list(), + pl$concat(l_ver)$to_list() + ) + + # check rechunk works + expect_identical(pl$concat(mtcars, mtcars, rechunk = TRUE)$n_chunks(), rep(1, 11)) + expect_identical(pl$concat(mtcars, mtcars, rechunk = FALSE)$n_chunks(), rep(2, 11)) + + + # horizontal - l_hor = lapply(1:10, function(i) { + l_hor = lapply(1:5, function(i) { l_internal = list( 1:5, letters[1:5] @@ -28,8 +81,27 @@ test_that("concat dataframe", { do.call(cbind, lapply(l_hor, function(df) df$to_data_frame())) ) - # diagonal + pl$concat(pl$LazyFrame(a = 1:3), how = "horizontal") |> + get_err_ctx("Plain") |> + startsWith("how=='horizontal' is not supported for lazy") |> + expect_true() + + # can concat Series + expect_identical( + pl$concat(1:5, pl$Series(5:1, "b"), how = "horizontal")$to_list(), + list(1:5, b = 5:1) + ) + + + # diagonal eager df_dia = pl$concat(l_hor, how = "diagonal") - expect_equal(df_dia$shape, c(50, 20)) - expect_equal(mean(is.na(df_dia$to_data_frame())), 9 / 10) + expect_equal(df_dia$shape, c(25, 10)) + expect_equal(mean(is.na(df_dia$to_data_frame())), 8 / 10) + + # diagonal lazy + lf_dia = pl$concat(l_hor |> lapply(pl$LazyFrame), how = "diagonal") + expect_identical( + lf_dia$collect()$to_list(), + df_dia$to_list() + ) }) diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index 1c292f7f9..4b159916c 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -158,7 +158,7 @@ test_that("DataFrame, custom schema", { # works fine if a variable is called "schema" expect_no_error( - pl$DataFrame(list(schema = 1), schema = list(schema = pl$Float32)) + pl$DataFrame(list(schema = 1), schema = list(schema = pl$Float32)) ) # errors if incorrect datatype expect_error(pl$DataFrame(x = 1, schema = list(schema = foo))) @@ -808,6 +808,24 @@ test_that("join_asof_simple", { ) }) +test_that("n_chunks", { + df = pl$concat( + 1:10, + pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), + how = "horizontal" + ) + + expect_identical(df$n_chunks(), c(1, 2)) + expect_identical(df$n_chunks("first"), c(1)) + expect_identical(pl$DataFrame()$n_chunks(), numeric()) + expect_identical(pl$DataFrame()$n_chunks("first"), numeric()) + + pl$DataFrame()$n_chunks("wrong strat") |> + get_err_ctx("Plain") |> + grepl(pat = "strategy") |> + expect_true() +}) + test_that("melt example", { df = pl$DataFrame( diff --git a/tests/testthat/test-expr_arr.R b/tests/testthat/test-expr_arr.R index e11138021..fc4d26c31 100644 --- a/tests/testthat/test-expr_arr.R +++ b/tests/testthat/test-expr_arr.R @@ -436,4 +436,3 @@ test_that("eval", { ) ) }) - diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index 8402c786a..b5688e81b 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -774,9 +774,9 @@ test_that("unnest", { df2 = df$ select( - pl$col("a", "b", "c")$to_struct()$alias("first_struct"), - pl$col("d", "e", "f")$to_struct()$alias("second_struct") - ) + pl$col("a", "b", "c")$to_struct()$alias("first_struct"), + pl$col("d", "e", "f")$to_struct()$alias("second_struct") + ) expect_identical( df2$unnest()$collect()$to_data_frame(), @@ -787,9 +787,9 @@ test_that("unnest", { df2$unnest("first_struct")$collect()$to_data_frame(), df$ select( - pl$col("a", "b", "c"), - pl$col("d", "e", "f")$to_struct()$alias("second_struct") - )$ + pl$col("a", "b", "c"), + pl$col("d", "e", "f")$to_struct()$alias("second_struct") + )$ collect()$ to_data_frame() )