From 341c12fa11d4ce4558453f9926c0ad2cdf5f8558 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 3 Oct 2024 22:30:25 +0200 Subject: [PATCH] feat: add `$join_where()` for inequality joins (#1237) Co-authored-by: eitsupi Co-authored-by: etiennebacher --- NEWS.md | 2 + R/dataframe__frame.R | 43 ++ R/extendr-wrappers.R | 2 + R/lazyframe__lazy.R | 56 +++ man/DataFrame_join_where.Rd | 50 +++ man/LazyFrame_join_where.Rd | 50 +++ src/rust/Cargo.toml | 1 + src/rust/src/lazy/dataframe.rs | 16 + tests/testthat/_snaps/after-wrappers.md | 89 ++-- tests/testthat/_snaps/dataframe.new.md | 540 ++++++++++++++++++++++++ tests/testthat/test-dataframe.R | 77 ++++ tests/testthat/test-lazy.R | 77 ++++ 12 files changed, 959 insertions(+), 44 deletions(-) create mode 100644 man/DataFrame_join_where.Rd create mode 100644 man/LazyFrame_join_where.Rd create mode 100644 tests/testthat/_snaps/dataframe.new.md diff --git a/NEWS.md b/NEWS.md index 4e8127e9f..0a9256d67 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,6 +18,8 @@ should trigger an error (#1220). - New method `$to_dummies()` for `DataFrame` (#1225). - New argument `include_file_paths` in `pl_scan_csv()` and `pl_read_csv()` (#1235). +- New method `$join_where()` for `DataFrame` and `LazyFrame` to perform + inequality joins (#1237). ### Bug fixes diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 6e02c2dad..9983a5d8a 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -2577,3 +2577,46 @@ DataFrame_to_dummies = function( .pr$DataFrame$to_dummies(self, columns = columns, separator = separator, drop_first = drop_first) |> unwrap("in $to_dummies():") } + +#' @inherit LazyFrame_join_where title params +#' +#' @description +#' This performs an inner join, so only rows where all predicates are true are +#' included in the result, and a row from either DataFrame may be included +#' multiple times in the result. 
+#' +#' Note that the row order of the input DataFrames is not preserved. +#' +#' @param other DataFrame to join with. +#' +#' @return A DataFrame +#' +#' @examples +#' east = pl$DataFrame( +#' id = c(100, 101, 102), +#' dur = c(120, 140, 160), +#' rev = c(12, 14, 16), +#' cores = c(2, 8, 4) +#' ) +#' +#' west = pl$DataFrame( +#' t_id = c(404, 498, 676, 742), +#' time = c(90, 130, 150, 170), +#' cost = c(9, 13, 15, 16), +#' cores = c(4, 2, 1, 4) +#' ) +#' +#' east$join_where( +#' west, +#' pl$col("dur") < pl$col("time"), +#' pl$col("rev") < pl$col("cost") +#' ) +DataFrame_join_where = function( + other, + ..., + suffix = "_right") { + if (!is_polars_df(other)) { + Err_plain("`other` must be a DataFrame.") |> unwrap() + } + self$lazy()$join_where(other = other$lazy(), ..., suffix = suffix)$collect() +} diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index ff7c21b05..b703cc9fc 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -1258,6 +1258,8 @@ RPolarsLazyFrame$join_asof <- function(other, left_on, right_on, left_by, right_ RPolarsLazyFrame$join <- function(other, left_on, right_on, how, validate, join_nulls, suffix, allow_parallel, force_parallel, coalesce) .Call(wrap__RPolarsLazyFrame__join, self, other, left_on, right_on, how, validate, join_nulls, suffix, allow_parallel, force_parallel, coalesce) +RPolarsLazyFrame$join_where <- function(other, predicates, suffix) .Call(wrap__RPolarsLazyFrame__join_where, self, other, predicates, suffix) + RPolarsLazyFrame$sort_by_exprs <- function(by, dotdotdot, descending, nulls_last, maintain_order, multithreaded) .Call(wrap__RPolarsLazyFrame__sort_by_exprs, self, by, dotdotdot, descending, nulls_last, maintain_order, multithreaded) RPolarsLazyFrame$unpivot <- function(on, index, value_name, variable_name) .Call(wrap__RPolarsLazyFrame__unpivot, self, on, index, value_name, variable_name) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 35ab42575..e7d10eb5a 100644 --- a/R/lazyframe__lazy.R +++ 
b/R/lazyframe__lazy.R @@ -1367,6 +1367,62 @@ LazyFrame_join = function( uw() } +#' Perform a join based on one or multiple (in)equality predicates +#' +#' @description +#' This performs an inner join, so only rows where all predicates are true are +#' included in the result, and a row from either LazyFrame may be included +#' multiple times in the result. +#' +#' Note that the row order of the input LazyFrames is not preserved. +#' +#' @param other LazyFrame to join with. +#' @param ... (In)Equality condition to join the two tables on. When a column +#' name occurs in both tables, the proper suffix must be applied in the +#' predicate. For example, if both tables have a column `"x"` that you want to +#' use in the conditions, you must refer to the column of the right table as +#' `"x<suffix>"`. +#' @param suffix Suffix to append to columns with a duplicate name. +#' +#' @return A LazyFrame +#' +#' @examples +#' east = pl$LazyFrame( +#' id = c(100, 101, 102), +#' dur = c(120, 140, 160), +#' rev = c(12, 14, 16), +#' cores = c(2, 8, 4) +#' ) +#' +#' west = pl$LazyFrame( +#' t_id = c(404, 498, 676, 742), +#' time = c(90, 130, 150, 170), +#' cost = c(9, 13, 15, 16), +#' cores = c(4, 2, 1, 4) +#' ) +#' +#' east$join_where( +#' west, +#' pl$col("dur") < pl$col("time"), +#' pl$col("rev") < pl$col("cost") +#' )$collect() +LazyFrame_join_where = function( + other, + ..., + suffix = "_right") { + uw = \(res) unwrap(res, "in $join_where():") + + if (!is_polars_lf(other)) { + Err_plain("`other` must be a LazyFrame.") |> uw() + } + + .pr$LazyFrame$join_where( + self, other, unpack_list(..., .context = "in $join_where():"), suffix + ) |> + uw() +} + + #' Sort the LazyFrame by the given columns #' diff --git a/man/DataFrame_join_where.Rd b/man/DataFrame_join_where.Rd new file mode 100644 index 000000000..0281041e7 --- /dev/null +++ b/man/DataFrame_join_where.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe__frame.R 
+\name{DataFrame_join_where} +\alias{DataFrame_join_where} +\title{Perform a join based on one or multiple (in)equality predicates} +\usage{ +DataFrame_join_where(other, ..., suffix = "_right") +} +\arguments{ +\item{other}{DataFrame to join with.} + +\item{...}{(In)Equality condition to join the two tables on. When a column +name occurs in both tables, the proper suffix must be applied in the +predicate. For example, if both tables have a column \code{"x"} that you want to +use in the conditions, you must refer to the column of the right table as +\code{"x<suffix>"}.} + +\item{suffix}{Suffix to append to columns with a duplicate name.} +} +\value{ +A DataFrame +} +\description{ +This performs an inner join, so only rows where all predicates are true are +included in the result, and a row from either DataFrame may be included +multiple times in the result. + +Note that the row order of the input DataFrames is not preserved. +} +\examples{ +east = pl$DataFrame( + id = c(100, 101, 102), + dur = c(120, 140, 160), + rev = c(12, 14, 16), + cores = c(2, 8, 4) +) + +west = pl$DataFrame( + t_id = c(404, 498, 676, 742), + time = c(90, 130, 150, 170), + cost = c(9, 13, 15, 16), + cores = c(4, 2, 1, 4) +) + +east$join_where( + west, + pl$col("dur") < pl$col("time"), + pl$col("rev") < pl$col("cost") +) +} diff --git a/man/LazyFrame_join_where.Rd b/man/LazyFrame_join_where.Rd new file mode 100644 index 000000000..f8b5db3c2 --- /dev/null +++ b/man/LazyFrame_join_where.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe__lazy.R +\name{LazyFrame_join_where} +\alias{LazyFrame_join_where} +\title{Perform a join based on one or multiple (in)equality predicates} +\usage{ +LazyFrame_join_where(other, ..., suffix = "_right") +} +\arguments{ +\item{other}{LazyFrame to join with.} + +\item{...}{(In)Equality condition to join the two tables on. When a column +name occurs in both tables, the proper suffix must be applied in the +predicate. 
For example, if both tables have a column \code{"x"} that you want to +use in the conditions, you must refer to the column of the right table as +\code{"x<suffix>"}.} + +\item{suffix}{Suffix to append to columns with a duplicate name.} +} +\value{ +A LazyFrame +} +\description{ +This performs an inner join, so only rows where all predicates are true are +included in the result, and a row from either LazyFrame may be included +multiple times in the result. + +Note that the row order of the input LazyFrames is not preserved. +} +\examples{ +east = pl$LazyFrame( + id = c(100, 101, 102), + dur = c(120, 140, 160), + rev = c(12, 14, 16), + cores = c(2, 8, 4) +) + +west = pl$LazyFrame( + t_id = c(404, 498, 676, 742), + time = c(90, 130, 150, 170), + cost = c(9, 13, 15, 16), + cores = c(4, 2, 1, 4) +) + +east$join_where( + west, + pl$col("dur") < pl$col("time"), + pl$col("rev") < pl$col("cost") +)$collect() +} diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index 468d37e2a..40ca822bf 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -97,6 +97,7 @@ features = [ "fmt", "gcp", "http", + "iejoin", "interpolate", "ipc", "is_between", diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index 04857e68a..40c547b02 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -478,6 +478,22 @@ impl RPolarsLazyFrame { )) } + fn join_where(&self, other: Robj, predicates: Robj, suffix: Robj) -> RResult<Self> { + let ldf = self.0.clone(); + let other = robj_to!(PLLazyFrame, other)?; + let predicates = robj_to!(VecPLExprColNamed, predicates)?; + let suffix = robj_to!(str, suffix)?; + + let out = ldf + .join_builder() + .with(other) + .suffix(suffix) + .join_where(predicates) + .into(); + + Ok(out) + } + pub fn sort_by_exprs( &self, by: Robj, diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index 86567cbb7..ebf49d224 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ 
b/tests/testthat/_snaps/after-wrappers.md @@ -84,19 +84,19 @@ [17] "first" "flags" "gather_every" "get_column" [21] "get_columns" "glimpse" "group_by" "group_by_dynamic" [25] "head" "height" "item" "join" - [29] "join_asof" "last" "lazy" "limit" - [33] "max" "mean" "median" "min" - [37] "n_chunks" "null_count" "partition_by" "pivot" - [41] "print" "quantile" "rechunk" "rename" - [45] "reverse" "rolling" "sample" "schema" - [49] "select" "select_seq" "shape" "shift" - [53] "slice" "sort" "sql" "std" - [57] "sum" "tail" "to_data_frame" "to_dummies" - [61] "to_list" "to_raw_ipc" "to_series" "to_struct" - [65] "transpose" "unique" "unnest" "unpivot" - [69] "var" "width" "with_columns" "with_columns_seq" - [73] "with_row_index" "write_csv" "write_ipc" "write_json" - [77] "write_ndjson" "write_parquet" + [29] "join_asof" "join_where" "last" "lazy" + [33] "limit" "max" "mean" "median" + [37] "min" "n_chunks" "null_count" "partition_by" + [41] "pivot" "print" "quantile" "rechunk" + [45] "rename" "reverse" "rolling" "sample" + [49] "schema" "select" "select_seq" "shape" + [53] "shift" "slice" "sort" "sql" + [57] "std" "sum" "tail" "to_data_frame" + [61] "to_dummies" "to_list" "to_raw_ipc" "to_series" + [65] "to_struct" "transpose" "unique" "unnest" + [69] "unpivot" "var" "width" "with_columns" + [73] "with_columns_seq" "with_row_index" "write_csv" "write_ipc" + [77] "write_json" "write_ndjson" "write_parquet" --- @@ -150,19 +150,19 @@ [13] "fill_nan" "fill_null" "filter" [16] "first" "gather_every" "group_by" [19] "group_by_dynamic" "head" "join" - [22] "join_asof" "last" "limit" - [25] "max" "mean" "median" - [28] "min" "print" "profile" - [31] "quantile" "rename" "reverse" - [34] "rolling" "schema" "select" - [37] "select_seq" "serialize" "shift" - [40] "sink_csv" "sink_ipc" "sink_ndjson" - [43] "sink_parquet" "slice" "sort" - [46] "sql" "std" "sum" - [49] "tail" "to_dot" "unique" - [52] "unnest" "unpivot" "var" - [55] "width" "with_columns" "with_columns_seq" - [58] 
"with_context" "with_row_index" + [22] "join_asof" "join_where" "last" + [25] "limit" "max" "mean" + [28] "median" "min" "print" + [31] "profile" "quantile" "rename" + [34] "reverse" "rolling" "schema" + [37] "select" "select_seq" "serialize" + [40] "shift" "sink_csv" "sink_ipc" + [43] "sink_ndjson" "sink_parquet" "slice" + [46] "sort" "sql" "std" + [49] "sum" "tail" "to_dot" + [52] "unique" "unnest" "unpivot" + [55] "var" "width" "with_columns" + [58] "with_columns_seq" "with_context" "with_row_index" --- @@ -180,24 +180,25 @@ [17] "fill_null" "filter" [19] "first" "group_by" [21] "group_by_dynamic" "join" - [23] "join_asof" "last" - [25] "max" "mean" - [27] "median" "min" - [29] "optimization_toggle" "print" - [31] "profile" "quantile" - [33] "rename" "reverse" - [35] "rolling" "schema" - [37] "select" "select_seq" - [39] "serialize" "shift" - [41] "sink_csv" "sink_ipc" - [43] "sink_json" "sink_parquet" - [45] "slice" "sort_by_exprs" - [47] "std" "sum" - [49] "tail" "to_dot" - [51] "unique" "unnest" - [53] "unpivot" "var" - [55] "with_columns" "with_columns_seq" - [57] "with_context" "with_row_index" + [23] "join_asof" "join_where" + [25] "last" "max" + [27] "mean" "median" + [29] "min" "optimization_toggle" + [31] "print" "profile" + [33] "quantile" "rename" + [35] "reverse" "rolling" + [37] "schema" "select" + [39] "select_seq" "serialize" + [41] "shift" "sink_csv" + [43] "sink_ipc" "sink_json" + [45] "sink_parquet" "slice" + [47] "sort_by_exprs" "std" + [49] "sum" "tail" + [51] "to_dot" "unique" + [53] "unnest" "unpivot" + [55] "var" "with_columns" + [57] "with_columns_seq" "with_context" + [59] "with_row_index" # public and private methods of each class Expr diff --git a/tests/testthat/_snaps/dataframe.new.md b/tests/testthat/_snaps/dataframe.new.md new file mode 100644 index 000000000..fc1c14e07 --- /dev/null +++ b/tests/testthat/_snaps/dataframe.new.md @@ -0,0 +1,540 @@ +# DataFrame, mixed input, create and print .name=dummy, .value=dummy + + Code + df + 
Output + shape: (5, 6) + ┌─────────┬──────┬─────┬────────────┬──────────────┬──────────────┐ + │ newname ┆ a ┆ b ┆ new_column ┆ named_vector ┆ new_column_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 ┆ f64 ┆ f64 │ + ╞═════════╪══════╪═════╪════════════╪══════════════╪══════════════╡ + │ 1.0 ┆ 5.0 ┆ a ┆ 5.0 ┆ 15.0 ┆ 5.0 │ + │ 2.0 ┆ 10.0 ┆ b ┆ 4.0 ┆ 14.0 ┆ 4.0 │ + │ 3.0 ┆ 15.0 ┆ c ┆ 3.0 ┆ 13.0 ┆ 3.0 │ + │ 4.0 ┆ 20.0 ┆ d ┆ 2.0 ┆ 12.0 ┆ 2.0 │ + │ 5.0 ┆ 25.0 ┆ e ┆ 1.0 ┆ 11.0 ┆ 0.0 │ + └─────────┴──────┴─────┴────────────┴──────────────┴──────────────┘ + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_CELL_ALIGNMENT, .value=RIGHT + + Code + df + Output + shape: (5, 6) + ┌─────────┬──────┬─────┬────────────┬──────────────┬──────────────┐ + │ newname ┆ a ┆ b ┆ new_column ┆ named_vector ┆ new_column_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 ┆ f64 ┆ f64 │ + ╞═════════╪══════╪═════╪════════════╪══════════════╪══════════════╡ + │ 1.0 ┆ 5.0 ┆ a ┆ 5.0 ┆ 15.0 ┆ 5.0 │ + │ 2.0 ┆ 10.0 ┆ b ┆ 4.0 ┆ 14.0 ┆ 4.0 │ + │ 3.0 ┆ 15.0 ┆ c ┆ 3.0 ┆ 13.0 ┆ 3.0 │ + │ 4.0 ┆ 20.0 ┆ d ┆ 2.0 ┆ 12.0 ┆ 2.0 │ + │ 5.0 ┆ 25.0 ┆ e ┆ 1.0 ┆ 11.0 ┆ 0.0 │ + └─────────┴──────┴─────┴────────────┴──────────────┴──────────────┘ + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_DATAFRAME_SHAPE_BELOW, .value=1 + + Code + df + Output + ┌─────────┬──────┬─────┬────────────┬──────────────┬──────────────┐ + │ newname ┆ a ┆ b ┆ new_column ┆ named_vector ┆ new_column_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 ┆ f64 ┆ f64 │ + ╞═════════╪══════╪═════╪════════════╪══════════════╪══════════════╡ + │ 1.0 ┆ 5.0 ┆ a ┆ 5.0 ┆ 15.0 ┆ 5.0 │ + │ 2.0 ┆ 10.0 ┆ b ┆ 4.0 ┆ 14.0 ┆ 4.0 │ + │ 3.0 ┆ 15.0 ┆ c ┆ 3.0 ┆ 13.0 ┆ 3.0 │ + │ 4.0 ┆ 20.0 ┆ d ┆ 2.0 ┆ 12.0 ┆ 2.0 │ + │ 5.0 ┆ 25.0 ┆ e ┆ 1.0 ┆ 11.0 ┆ 0.0 │ + └─────────┴──────┴─────┴────────────┴──────────────┴──────────────┘ + shape: (5, 6) + +# DataFrame, mixed input, create 
and print .name=POLARS_FMT_TABLE_FORMATTING, .value=ASCII_FULL + + Code + df + Output + shape: (5, 6) + +---------+------+-----+------------+--------------+--------------+ + | newname | a | b | new_column | named_vector | new_column_1 | + | --- | --- | --- | --- | --- | --- | + | f64 | f64 | str | f64 | f64 | f64 | + +=================================================================+ + | 1.0 | 5.0 | a | 5.0 | 15.0 | 5.0 | + |---------+------+-----+------------+--------------+--------------| + | 2.0 | 10.0 | b | 4.0 | 14.0 | 4.0 | + |---------+------+-----+------------+--------------+--------------| + | 3.0 | 15.0 | c | 3.0 | 13.0 | 3.0 | + |---------+------+-----+------------+--------------+--------------| + | 4.0 | 20.0 | d | 2.0 | 12.0 | 2.0 | + |---------+------+-----+------------+--------------+--------------| + | 5.0 | 25.0 | e | 1.0 | 11.0 | 0.0 | + +---------+------+-----+------------+--------------+--------------+ + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_FORMATTING, .value=ASCII_FULL_CONDENSED + + Code + df + Output + shape: (5, 6) + +---------+------+-----+------------+--------------+--------------+ + | newname | a | b | new_column | named_vector | new_column_1 | + | --- | --- | --- | --- | --- | --- | + | f64 | f64 | str | f64 | f64 | f64 | + +=================================================================+ + | 1.0 | 5.0 | a | 5.0 | 15.0 | 5.0 | + | 2.0 | 10.0 | b | 4.0 | 14.0 | 4.0 | + | 3.0 | 15.0 | c | 3.0 | 13.0 | 3.0 | + | 4.0 | 20.0 | d | 2.0 | 12.0 | 2.0 | + | 5.0 | 25.0 | e | 1.0 | 11.0 | 0.0 | + +---------+------+-----+------------+--------------+--------------+ + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_FORMATTING, .value=ASCII_NO_BORDERS + + Code + df + Output + shape: (5, 6) + newname | a | b | new_column | named_vector | new_column_1 + --- | --- | --- | --- | --- | --- + f64 | f64 | str | f64 | f64 | f64 + ================================================================= + 1.0 | 5.0 | a 
| 5.0 | 15.0 | 5.0 + ---------+------+-----+------------+--------------+-------------- + 2.0 | 10.0 | b | 4.0 | 14.0 | 4.0 + ---------+------+-----+------------+--------------+-------------- + 3.0 | 15.0 | c | 3.0 | 13.0 | 3.0 + ---------+------+-----+------------+--------------+-------------- + 4.0 | 20.0 | d | 2.0 | 12.0 | 2.0 + ---------+------+-----+------------+--------------+-------------- + 5.0 | 25.0 | e | 1.0 | 11.0 | 0.0 + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_FORMATTING, .value=ASCII_BORDERS_ONLY + + Code + df + Output + shape: (5, 6) + +-----------------------------------------------------------------+ + | newname a b new_column named_vector new_column_1 | + | --- --- --- --- --- --- | + | f64 f64 str f64 f64 f64 | + +=================================================================+ + | 1.0 5.0 a 5.0 15.0 5.0 | + | | + | 2.0 10.0 b 4.0 14.0 4.0 | + | | + | 3.0 15.0 c 3.0 13.0 3.0 | + | | + | 4.0 20.0 d 2.0 12.0 2.0 | + | | + | 5.0 25.0 e 1.0 11.0 0.0 | + +-----------------------------------------------------------------+ + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_FORMATTING, .value=ASCII_BORDERS_ONLY_CONDENSED + + Code + df + Output + shape: (5, 6) + +-----------------------------------------------------------------+ + | newname a b new_column named_vector new_column_1 | + | --- --- --- --- --- --- | + | f64 f64 str f64 f64 f64 | + +=================================================================+ + | 1.0 5.0 a 5.0 15.0 5.0 | + | 2.0 10.0 b 4.0 14.0 4.0 | + | 3.0 15.0 c 3.0 13.0 3.0 | + | 4.0 20.0 d 2.0 12.0 2.0 | + | 5.0 25.0 e 1.0 11.0 0.0 | + +-----------------------------------------------------------------+ + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_FORMATTING, .value=ASCII_HORIZONTAL_ONLY + + Code + df + Output + shape: (5, 6) + ----------------------------------------------------------------- + newname a b new_column named_vector new_column_1 + --- --- --- --- --- 
--- + f64 f64 str f64 f64 f64 + ================================================================= + 1.0 5.0 a 5.0 15.0 5.0 + ----------------------------------------------------------------- + 2.0 10.0 b 4.0 14.0 4.0 + ----------------------------------------------------------------- + 3.0 15.0 c 3.0 13.0 3.0 + ----------------------------------------------------------------- + 4.0 20.0 d 2.0 12.0 2.0 + ----------------------------------------------------------------- + 5.0 25.0 e 1.0 11.0 0.0 + ----------------------------------------------------------------- + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_FORMATTING, .value=ASCII_MARKDOWN + + Code + df + Output + shape: (5, 6) + | newname | a | b | new_column | named_vector | new_column_1 | + | --- | --- | --- | --- | --- | --- | + | f64 | f64 | str | f64 | f64 | f64 | + |---------|------|-----|------------|--------------|--------------| + | 1.0 | 5.0 | a | 5.0 | 15.0 | 5.0 | + | 2.0 | 10.0 | b | 4.0 | 14.0 | 4.0 | + | 3.0 | 15.0 | c | 3.0 | 13.0 | 3.0 | + | 4.0 | 20.0 | d | 2.0 | 12.0 | 2.0 | + | 5.0 | 25.0 | e | 1.0 | 11.0 | 0.0 | + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_FORMATTING, .value=UTF8_FULL + + Code + df + Output + shape: (5, 6) + ┌─────────┬──────┬─────┬────────────┬──────────────┬──────────────┐ + │ newname ┆ a ┆ b ┆ new_column ┆ named_vector ┆ new_column_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 ┆ f64 ┆ f64 │ + ╞═════════╪══════╪═════╪════════════╪══════════════╪══════════════╡ + │ 1.0 ┆ 5.0 ┆ a ┆ 5.0 ┆ 15.0 ┆ 5.0 │ + ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 2.0 ┆ 10.0 ┆ b ┆ 4.0 ┆ 14.0 ┆ 4.0 │ + ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 3.0 ┆ 15.0 ┆ c ┆ 3.0 ┆ 13.0 ┆ 3.0 │ + ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 4.0 ┆ 20.0 ┆ d ┆ 2.0 ┆ 12.0 ┆ 2.0 │ + ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 5.0 ┆ 25.0 
┆ e ┆ 1.0 ┆ 11.0 ┆ 0.0 │ + └─────────┴──────┴─────┴────────────┴──────────────┴──────────────┘ + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_FORMATTING, .value=UTF8_FULL_CONDENSED + + Code + df + Output + shape: (5, 6) + ┌─────────┬──────┬─────┬────────────┬──────────────┬──────────────┐ + │ newname ┆ a ┆ b ┆ new_column ┆ named_vector ┆ new_column_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 ┆ f64 ┆ f64 │ + ╞═════════╪══════╪═════╪════════════╪══════════════╪══════════════╡ + │ 1.0 ┆ 5.0 ┆ a ┆ 5.0 ┆ 15.0 ┆ 5.0 │ + │ 2.0 ┆ 10.0 ┆ b ┆ 4.0 ┆ 14.0 ┆ 4.0 │ + │ 3.0 ┆ 15.0 ┆ c ┆ 3.0 ┆ 13.0 ┆ 3.0 │ + │ 4.0 ┆ 20.0 ┆ d ┆ 2.0 ┆ 12.0 ┆ 2.0 │ + │ 5.0 ┆ 25.0 ┆ e ┆ 1.0 ┆ 11.0 ┆ 0.0 │ + └─────────┴──────┴─────┴────────────┴──────────────┴──────────────┘ + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_FORMATTING, .value=UTF8_NO_BORDERS + + Code + df + Output + shape: (5, 6) + newname ┆ a ┆ b ┆ new_column ┆ named_vector ┆ new_column_1 + --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- + f64 ┆ f64 ┆ str ┆ f64 ┆ f64 ┆ f64 + ═════════╪══════╪═════╪════════════╪══════════════╪══════════════ + 1.0 ┆ 5.0 ┆ a ┆ 5.0 ┆ 15.0 ┆ 5.0 + ╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌ + 2.0 ┆ 10.0 ┆ b ┆ 4.0 ┆ 14.0 ┆ 4.0 + ╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌ + 3.0 ┆ 15.0 ┆ c ┆ 3.0 ┆ 13.0 ┆ 3.0 + ╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌ + 4.0 ┆ 20.0 ┆ d ┆ 2.0 ┆ 12.0 ┆ 2.0 + ╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌ + 5.0 ┆ 25.0 ┆ e ┆ 1.0 ┆ 11.0 ┆ 0.0 + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_FORMATTING, .value=UTF8_BORDERS_ONLY + + Code + df + Output + shape: (5, 6) + ┌─────────────────────────────────────────────────────────────────┐ + │ newname a b new_column named_vector new_column_1 │ + │ --- --- --- --- --- --- │ + │ f64 f64 str f64 f64 f64 │ + ╞═════════════════════════════════════════════════════════════════╡ + │ 1.0 
5.0 a 5.0 15.0 5.0 │ + │ 2.0 10.0 b 4.0 14.0 4.0 │ + │ 3.0 15.0 c 3.0 13.0 3.0 │ + │ 4.0 20.0 d 2.0 12.0 2.0 │ + │ 5.0 25.0 e 1.0 11.0 0.0 │ + └─────────────────────────────────────────────────────────────────┘ + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_FORMATTING, .value=UTF8_HORIZONTAL_ONLY + + Code + df + Output + shape: (5, 6) + ───────────────────────────────────────────────────────────────── + newname a b new_column named_vector new_column_1 + --- --- --- --- --- --- + f64 f64 str f64 f64 f64 + ═════════════════════════════════════════════════════════════════ + 1.0 5.0 a 5.0 15.0 5.0 + ───────────────────────────────────────────────────────────────── + 2.0 10.0 b 4.0 14.0 4.0 + ───────────────────────────────────────────────────────────────── + 3.0 15.0 c 3.0 13.0 3.0 + ───────────────────────────────────────────────────────────────── + 4.0 20.0 d 2.0 12.0 2.0 + ───────────────────────────────────────────────────────────────── + 5.0 25.0 e 1.0 11.0 0.0 + ───────────────────────────────────────────────────────────────── + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_FORMATTING, .value=NOTHING + + Code + df + Output + shape: (5, 6) + newname a b new_column named_vector new_column_1 + --- --- --- --- --- --- + f64 f64 str f64 f64 f64 + 1.0 5.0 a 5.0 15.0 5.0 + 2.0 10.0 b 4.0 14.0 4.0 + 3.0 15.0 c 3.0 13.0 3.0 + 4.0 20.0 d 2.0 12.0 2.0 + 5.0 25.0 e 1.0 11.0 0.0 + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_HIDE_COLUMN_DATA_TYPES, .value=1 + + Code + df + Output + shape: (5, 6) + ┌─────────┬──────┬───┬────────────┬──────────────┬──────────────┐ + │ newname ┆ a ┆ b ┆ new_column ┆ named_vector ┆ new_column_1 │ + ╞═════════╪══════╪═══╪════════════╪══════════════╪══════════════╡ + │ 1.0 ┆ 5.0 ┆ a ┆ 5.0 ┆ 15.0 ┆ 5.0 │ + │ 2.0 ┆ 10.0 ┆ b ┆ 4.0 ┆ 14.0 ┆ 4.0 │ + │ 3.0 ┆ 15.0 ┆ c ┆ 3.0 ┆ 13.0 ┆ 3.0 │ + │ 4.0 ┆ 20.0 ┆ d ┆ 2.0 ┆ 12.0 ┆ 2.0 │ + │ 5.0 ┆ 25.0 ┆ e ┆ 1.0 ┆ 11.0 ┆ 0.0 │ + 
└─────────┴──────┴───┴────────────┴──────────────┴──────────────┘ + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_HIDE_COLUMN_NAMES, .value=1 + + Code + df + Output + shape: (5, 6) + ┌─────┬──────┬─────┬─────┬──────┬─────┐ + │ f64 ┆ f64 ┆ str ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪═════╪═════╪══════╪═════╡ + │ 1.0 ┆ 5.0 ┆ a ┆ 5.0 ┆ 15.0 ┆ 5.0 │ + │ 2.0 ┆ 10.0 ┆ b ┆ 4.0 ┆ 14.0 ┆ 4.0 │ + │ 3.0 ┆ 15.0 ┆ c ┆ 3.0 ┆ 13.0 ┆ 3.0 │ + │ 4.0 ┆ 20.0 ┆ d ┆ 2.0 ┆ 12.0 ┆ 2.0 │ + │ 5.0 ┆ 25.0 ┆ e ┆ 1.0 ┆ 11.0 ┆ 0.0 │ + └─────┴──────┴─────┴─────┴──────┴─────┘ + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_HIDE_COLUMN_SEPARATOR, .value=1 + + Code + df + Output + shape: (5, 6) + ┌─────────┬──────┬─────┬────────────┬──────────────┬──────────────┐ + │ newname ┆ a ┆ b ┆ new_column ┆ named_vector ┆ new_column_1 │ + │ f64 ┆ f64 ┆ str ┆ f64 ┆ f64 ┆ f64 │ + ╞═════════╪══════╪═════╪════════════╪══════════════╪══════════════╡ + │ 1.0 ┆ 5.0 ┆ a ┆ 5.0 ┆ 15.0 ┆ 5.0 │ + │ 2.0 ┆ 10.0 ┆ b ┆ 4.0 ┆ 14.0 ┆ 4.0 │ + │ 3.0 ┆ 15.0 ┆ c ┆ 3.0 ┆ 13.0 ┆ 3.0 │ + │ 4.0 ┆ 20.0 ┆ d ┆ 2.0 ┆ 12.0 ┆ 2.0 │ + │ 5.0 ┆ 25.0 ┆ e ┆ 1.0 ┆ 11.0 ┆ 0.0 │ + └─────────┴──────┴─────┴────────────┴──────────────┴──────────────┘ + +# DataFrame, mixed input, create and print .name=POLARS_FMT_TABLE_HIDE_DATAFRAME_SHAPE_INFORMATION, .value=1 + + Code + df + Output + ┌─────────┬──────┬─────┬────────────┬──────────────┬──────────────┐ + │ newname ┆ a ┆ b ┆ new_column ┆ named_vector ┆ new_column_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 ┆ f64 ┆ f64 │ + ╞═════════╪══════╪═════╪════════════╪══════════════╪══════════════╡ + │ 1.0 ┆ 5.0 ┆ a ┆ 5.0 ┆ 15.0 ┆ 5.0 │ + │ 2.0 ┆ 10.0 ┆ b ┆ 4.0 ┆ 14.0 ┆ 4.0 │ + │ 3.0 ┆ 15.0 ┆ c ┆ 3.0 ┆ 13.0 ┆ 3.0 │ + │ 4.0 ┆ 20.0 ┆ d ┆ 2.0 ┆ 12.0 ┆ 2.0 │ + │ 5.0 ┆ 25.0 ┆ e ┆ 1.0 ┆ 11.0 ┆ 0.0 │ + └─────────┴──────┴─────┴────────────┴──────────────┴──────────────┘ + +# DataFrame, mixed input, create and print .name=POLARS_FMT_MAX_ROWS, .value=2 + + 
Code + df + Output + shape: (5, 6) + ┌─────────┬──────┬─────┬────────────┬──────────────┬──────────────┐ + │ newname ┆ a ┆ b ┆ new_column ┆ named_vector ┆ new_column_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 ┆ f64 ┆ f64 │ + ╞═════════╪══════╪═════╪════════════╪══════════════╪══════════════╡ + │ 1.0 ┆ 5.0 ┆ a ┆ 5.0 ┆ 15.0 ┆ 5.0 │ + │ … ┆ … ┆ … ┆ … ┆ … ┆ … │ + │ 5.0 ┆ 25.0 ┆ e ┆ 1.0 ┆ 11.0 ┆ 0.0 │ + └─────────┴──────┴─────┴────────────┴──────────────┴──────────────┘ + +# describe + + Code + df$describe() + Output + shape: (9, 5) + ┌────────────┬────────┬────────────┬──────┬───────┐ + │ statistic ┆ string ┆ date ┆ cat ┆ bool │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str ┆ str │ + ╞════════════╪════════╪════════════╪══════╪═══════╡ + │ count ┆ 2 ┆ 2 ┆ 2 ┆ 2 │ + │ null_count ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + │ mean ┆ null ┆ null ┆ null ┆ null │ + │ std ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ a ┆ 2024-01-20 ┆ zz ┆ false │ + │ 25% ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ b ┆ 2024-01-21 ┆ a ┆ true │ + └────────────┴────────┴────────────┴──────┴───────┘ + +--- + + Code + pl$DataFrame(mtcars)$describe() + Output + shape: (9, 12) + ┌────────┬───────┬───────┬───────┬───┬───────┬───────┬───────┬───────┐ + │ statis ┆ mpg ┆ cyl ┆ disp ┆ … ┆ vs ┆ am ┆ gear ┆ carb │ + │ tic ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ + │ --- ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ + │ str ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │ + ╞════════╪═══════╪═══════╪═══════╪═══╪═══════╪═══════╪═══════╪═══════╡ + │ count ┆ 32.0 ┆ 32.0 ┆ 32.0 ┆ … ┆ 32.0 ┆ 32.0 ┆ 32.0 ┆ 32.0 │ + │ null_c ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ + │ ount ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │ + │ mean ┆ 20.09 ┆ 6.187 ┆ 230.7 ┆ … ┆ 0.437 ┆ 0.406 ┆ 3.687 ┆ 2.812 │ + │ ┆ 0625 ┆ 5 ┆ 21875 ┆ ┆ 5 ┆ 25 ┆ 5 ┆ 5 │ + │ std ┆ 6.026 ┆ 1.785 ┆ 123.9 ┆ … ┆ 0.504 ┆ 0.498 ┆ 0.737 ┆ 1.615 │ + │ ┆ 948 ┆ 922 ┆ 38694 ┆ ┆ 016 ┆ 991 ┆ 804 ┆ 2 │ + │ min ┆ 10.4 ┆ 4.0 ┆ 71.1 ┆ … 
┆ 0.0 ┆ 0.0 ┆ 3.0 ┆ 1.0 │ + │ 25% ┆ 15.5 ┆ 4.0 ┆ 121.0 ┆ … ┆ 0.0 ┆ 0.0 ┆ 3.0 ┆ 2.0 │ + │ 50% ┆ 19.2 ┆ 6.0 ┆ 225.0 ┆ … ┆ 0.0 ┆ 0.0 ┆ 4.0 ┆ 2.0 │ + │ 75% ┆ 22.8 ┆ 8.0 ┆ 318.0 ┆ … ┆ 1.0 ┆ 1.0 ┆ 4.0 ┆ 4.0 │ + │ max ┆ 33.9 ┆ 8.0 ┆ 472.0 ┆ … ┆ 1.0 ┆ 1.0 ┆ 5.0 ┆ 8.0 │ + └────────┴───────┴───────┴───────┴───┴───────┴───────┴───────┴───────┘ + +--- + + Code + pl$DataFrame(mtcars)$describe(interpolation = "linear") + Output + shape: (9, 12) + ┌────────┬───────┬───────┬───────┬───┬───────┬───────┬───────┬───────┐ + │ statis ┆ mpg ┆ cyl ┆ disp ┆ … ┆ vs ┆ am ┆ gear ┆ carb │ + │ tic ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ + │ --- ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ + │ str ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │ + ╞════════╪═══════╪═══════╪═══════╪═══╪═══════╪═══════╪═══════╪═══════╡ + │ count ┆ 32.0 ┆ 32.0 ┆ 32.0 ┆ … ┆ 32.0 ┆ 32.0 ┆ 32.0 ┆ 32.0 │ + │ null_c ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ + │ ount ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │ + │ mean ┆ 20.09 ┆ 6.187 ┆ 230.7 ┆ … ┆ 0.437 ┆ 0.406 ┆ 3.687 ┆ 2.812 │ + │ ┆ 0625 ┆ 5 ┆ 21875 ┆ ┆ 5 ┆ 25 ┆ 5 ┆ 5 │ + │ std ┆ 6.026 ┆ 1.785 ┆ 123.9 ┆ … ┆ 0.504 ┆ 0.498 ┆ 0.737 ┆ 1.615 │ + │ ┆ 948 ┆ 922 ┆ 38694 ┆ ┆ 016 ┆ 991 ┆ 804 ┆ 2 │ + │ min ┆ 10.4 ┆ 4.0 ┆ 71.1 ┆ … ┆ 0.0 ┆ 0.0 ┆ 3.0 ┆ 1.0 │ + │ 25% ┆ 15.42 ┆ 4.0 ┆ 120.8 ┆ … ┆ 0.0 ┆ 0.0 ┆ 3.0 ┆ 2.0 │ + │ ┆ 5 ┆ ┆ 25 ┆ ┆ ┆ ┆ ┆ │ + │ 50% ┆ 19.2 ┆ 6.0 ┆ 196.3 ┆ … ┆ 0.0 ┆ 0.0 ┆ 4.0 ┆ 2.0 │ + │ 75% ┆ 22.8 ┆ 8.0 ┆ 326.0 ┆ … ┆ 1.0 ┆ 1.0 ┆ 4.0 ┆ 4.0 │ + │ max ┆ 33.9 ┆ 8.0 ┆ 472.0 ┆ … ┆ 1.0 ┆ 1.0 ┆ 5.0 ┆ 8.0 │ + └────────┴───────┴───────┴───────┴───┴───────┴───────┴───────┴───────┘ + +--- + + Code + df$select(pl$col("cat")$cast(pl$Categorical("lexical")))$describe() + Output + shape: (9, 2) + ┌────────────┬──────┐ + │ statistic ┆ cat │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════════╪══════╡ + │ count ┆ 2 │ + │ null_count ┆ 1 │ + │ mean ┆ null │ + │ std ┆ null │ + │ min ┆ a │ + │ 25% ┆ null │ + │ 50% ┆ null │ + │ 75% ┆ null │ + │ max ┆ zz │ + └────────────┴──────┘ + +# $glimpse() works + + Code + df$glimpse() + Output 
+ & mpg 21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2 + & cyl 6, 6, 4, 6, 8, 6, 8, 4, 4, 6 + & disp 160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 167.6 + & hp 110, 110, 93, 110, 175, 105, 245, 62, 95, 123 + & drat 3.9, 3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92 + & wt 2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 3.15, 3.44 + & qsec 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3 + & vs 0, 0, 1, 1, 0, 1, 0, 1, 1, 1 + & am 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 + & gear 4, 4, 4, 3, 3, 3, 3, 4, 4, 4 + & carb 4, 4, 1, 1, 2, 1, 4, 2, 2, 4 + & literal 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 + +--- + + Code + df$glimpse(max_items_per_column = 2) + Output + & mpg 21, 21 + & cyl 6, 6 + & disp 160, 160 + & hp 110, 110 + & drat 3.9, 3.9 + & wt 2.62, 2.875 + & qsec 16.46, 17.02 + & vs 0, 0 + & am 1, 1 + & gear 4, 4 + & carb 4, 4 + & literal 42, 42 + +--- + + Code + df$glimpse(max_colname_length = 2) + Output + & ... 21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2 + & ... 6, 6, 4, 6, 8, 6, 8, 4, 4, 6 + & ... 160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 167.6 + & hp 110, 110, 93, 110, 175, 105, 245, 62, 95, 123 + & ... 3.9, 3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92 + & wt 2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 3.15, 3.44 + & ... 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3 + & vs 0, 0, 1, 1, 0, 1, 0, 1, 1, 1 + & am 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 + & ... 4, 4, 4, 3, 3, 3, 3, 4, 4, 4 + & ... 4, 4, 1, 1, 2, 1, 4, 2, 2, 4 + & ... 
42, 42, 42, 42, 42, 42, 42, 42, 42, 42
+
diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R
index 88816f6e1..ac51a18b8 100644
--- a/tests/testthat/test-dataframe.R
+++ b/tests/testthat/test-dataframe.R
@@ -1761,3 +1761,87 @@ test_that("$to_dummies() works", {
     )
   )
 })
+
+test_that("inequality joins work", {
+  east = pl$DataFrame(
+    id = c(100, 101, 102),
+    dur = c(120, 140, 160),
+    rev = c(12, 14, 16),
+    cores = c(2, 8, 4)
+  )
+  west = pl$DataFrame(
+    t_id = c(404, 498, 676, 742),
+    time = c(90, 130, 150, 170),
+    cost = c(9, 13, 15, 16),
+    cores = c(4, 2, 1, 4)
+  )
+  # Rows are paired using only the predicates given to $join_where().
+  out = east$join_where(
+    west,
+    pl$col("dur") < pl$col("time"),
+    pl$col("rev") < pl$col("cost")
+  )
+
+  # Row order is not guaranteed by $join_where(), so sort on the id columns
+  # before comparing with the expected rows.
+  expect_identical(
+    out$sort(c("id", "t_id"))$to_data_frame(),
+    data.frame(
+      id = rep(c(100, 101), 3:2),
+      dur = rep(c(120, 140), 3:2),
+      rev = rep(c(12, 14), 3:2),
+      cores = rep(c(2, 8), 3:2),
+      t_id = c(498, 676, 742, 676, 742),
+      time = c(130, 150, 170, 150, 170),
+      cost = c(13, 15, 16, 15, 16),
+      cores_right = c(2, 1, 4, 1, 4)
+    )
+  )
+
+  # A LazyFrame is rejected as `other` on the DataFrame method.
+  expect_error(
+    east$join_where(
+      west$lazy(),
+      pl$col("dur") < pl$col("time"),
+      pl$col("rev") < pl$col("cost")
+    ),
+    "`other` must be a DataFrame"
+  )
+})
+
+test_that("inequality joins require suffix when identical column names", {
+  east = pl$DataFrame(
+    id = c(100, 101, 102),
+    dur = c(120, 140, 160),
+    rev = c(12, 14, 16),
+    cores = c(2, 8, 4)
+  )
+  west = pl$DataFrame(
+    t_id = c(404, 498, 676, 742),
+    dur = c(90, 130, 150, 170),
+    rev = c(9, 13, 15, 16),
+    cores = c(4, 2, 1, 4)
+  )
+  # "dur" and "rev" exist in both frames, so the right-hand columns are
+  # referenced through the default "_right" suffix.
+  out = east$join_where(
+    west,
+    pl$col("dur") < pl$col("dur_right"),
+    pl$col("rev") < pl$col("rev_right")
+  )
+
+  # Sort first: row order is not guaranteed by $join_where().
+  expect_identical(
+    out$sort(c("id", "t_id"))$to_data_frame(),
+    data.frame(
+      id = rep(c(100, 101), 3:2),
+      dur = rep(c(120, 140), 3:2),
+      rev = rep(c(12, 14), 3:2),
+      cores = rep(c(2, 8), 3:2),
+      t_id = c(498, 676, 742, 676, 742),
+      dur_right = c(130, 150, 170, 150, 170),
+      rev_right = c(13, 15, 16, 15, 16),
+      cores_right = c(2, 1, 4, 1, 4)
+    )
+  )
+})
diff --git
a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R
index ce42b9f88..509e8d83e 100644
--- a/tests/testthat/test-lazy.R
+++ b/tests/testthat/test-lazy.R
@@ -1198,3 +1198,87 @@ test_that("$cast() works", {
     list(x = NA_integer_)
   )
 })
+
+test_that("inequality joins work", {
+  east = pl$LazyFrame(
+    id = c(100, 101, 102),
+    dur = c(120, 140, 160),
+    rev = c(12, 14, 16),
+    cores = c(2, 8, 4)
+  )
+  west = pl$LazyFrame(
+    t_id = c(404, 498, 676, 742),
+    time = c(90, 130, 150, 170),
+    cost = c(9, 13, 15, 16),
+    cores = c(4, 2, 1, 4)
+  )
+  # Rows are paired using only the predicates given to $join_where().
+  out = east$join_where(
+    west,
+    pl$col("dur") < pl$col("time"),
+    pl$col("rev") < pl$col("cost")
+  )$collect()
+
+  # Row order is not guaranteed by $join_where(), so sort on the id columns
+  # before comparing with the expected rows.
+  expect_identical(
+    out$sort(c("id", "t_id"))$to_data_frame(),
+    data.frame(
+      id = rep(c(100, 101), 3:2),
+      dur = rep(c(120, 140), 3:2),
+      rev = rep(c(12, 14), 3:2),
+      cores = rep(c(2, 8), 3:2),
+      t_id = c(498, 676, 742, 676, 742),
+      time = c(130, 150, 170, 150, 170),
+      cost = c(13, 15, 16, 15, 16),
+      cores_right = c(2, 1, 4, 1, 4)
+    )
+  )
+
+  # A DataFrame is rejected as `other` on the LazyFrame method.
+  expect_error(
+    east$join_where(
+      west$collect(),
+      pl$col("dur") < pl$col("time"),
+      pl$col("rev") < pl$col("cost")
+    ),
+    "`other` must be a LazyFrame"
+  )
+})
+
+test_that("inequality joins require suffix when identical column names", {
+  east = pl$LazyFrame(
+    id = c(100, 101, 102),
+    dur = c(120, 140, 160),
+    rev = c(12, 14, 16),
+    cores = c(2, 8, 4)
+  )
+  west = pl$LazyFrame(
+    t_id = c(404, 498, 676, 742),
+    dur = c(90, 130, 150, 170),
+    rev = c(9, 13, 15, 16),
+    cores = c(4, 2, 1, 4)
+  )
+  # "dur" and "rev" exist in both frames, so the right-hand columns are
+  # referenced through the default "_right" suffix.
+  out = east$join_where(
+    west,
+    pl$col("dur") < pl$col("dur_right"),
+    pl$col("rev") < pl$col("rev_right")
+  )$collect()
+
+  # Sort first: row order is not guaranteed by $join_where().
+  expect_identical(
+    out$sort(c("id", "t_id"))$to_data_frame(),
+    data.frame(
+      id = rep(c(100, 101), 3:2),
+      dur = rep(c(120, 140), 3:2),
+      rev = rep(c(12, 14), 3:2),
+      cores = rep(c(2, 8), 3:2),
+      t_id = c(498, 676, 742, 676, 742),
+      dur_right = c(130, 150, 170, 150, 170),
+      rev_right = c(13, 15, 16, 15, 16),
+      cores_right = c(2, 1, 4, 1, 4)
+    )
+  )
+})