Skip to content

Commit

Permalink
melt + pivot (#232)
Browse files Browse the repository at this point in the history
Co-authored-by: eitsupi <[email protected]>
  • Loading branch information
sorhawell and eitsupi authored Jun 7, 2023
1 parent 47319e8 commit 20002fe
Show file tree
Hide file tree
Showing 13 changed files with 637 additions and 23 deletions.
96 changes: 96 additions & 0 deletions R/dataframe__frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -1270,3 +1270,99 @@ DataFrame_join_asof = function(
tolerance = tolerance
)$collect()
}




#' @inherit LazyFrame_melt
#' @keywords DataFrame
#'
#' @return A new `DataFrame`
#'
#' @examples
#' df = pl$DataFrame(
#'   a = c("x", "y", "z"),
#'   b = c(1, 3, 5),
#'   c = c(2, 4, 6)
#' )
#' df$melt(id_vars = "a", value_vars = c("b", "c"))
DataFrame_melt = function(
    id_vars = NULL,
    value_vars = NULL,
    variable_name = NULL,
    value_name = NULL) {
  # `%||%` supplies an empty character vector as the fallback for an omitted
  # (NULL) column selection
  id_columns = id_vars %||% character()
  value_columns = value_vars %||% character()

  # note the argument order of the internal method: value_name comes
  # before variable_name
  melt_result = .pr$DataFrame$melt(
    self, id_columns, value_columns, value_name, variable_name
  )

  # unwrap the Result, tagging any error with the method name
  unwrap(melt_result, "in $melt( ): ")
}



#' Create a spreadsheet-style pivot table as a DataFrame.
#' @param values Column values to aggregate. Can be multiple columns if the `columns`
#' argument contains multiple columns as well.
#' @param index One or multiple keys to group by.
#' @param columns Name of the column(s) whose values will be used as the header of the output
#' DataFrame.
#' @param aggregate_function
#' Either a string naming the Expr method to aggregate with, an Expr such as
#' `pl$element()$sum()`, or `NULL` (the default), which is passed on unchanged.
#' Examples of strings: 'first', 'sum', 'max', 'min', 'mean', 'median', 'last', 'count'
#' @param maintain_order Sort the grouped keys so that the output order is predictable.
#' @param sort_columns Sort the transposed columns by name. Default is by order of discovery.
#' @param separator Used as separator/delimiter in generated column names.
#'
#' @return DataFrame
#' @keywords DataFrame
#' @examples
#' df = pl$DataFrame(
#'   foo = c("one", "one", "one", "two", "two", "two"),
#'   bar = c("A", "B", "C", "A", "B", "C"),
#'   baz = c(1, 2, 3, 4, 5, 6)
#' )
#' df$pivot(
#'   values = "baz", index = "foo", columns = "bar", aggregate_function = "first"
#' )
#'
#'
#' # Run an expression as aggregation function
#' df = pl$DataFrame(
#'   col1 = c("a", "a", "a", "b", "b", "b"),
#'   col2 = c("x", "x", "x", "x", "y", "y"),
#'   col3 = c(6, 7, 3, 2, 5, 7)
#' )
#' df$pivot(
#'   index = "col1",
#'   columns = "col2",
#'   values = "col3",
#'   aggregate_function = pl$element()$tanh()$mean()
#' )
DataFrame_pivot = function(
    values,
    index,
    columns,
    aggregate_function = NULL,
    maintain_order = TRUE,
    sort_columns = FALSE,
    separator = "_") {
  # Step 1: turn aggregate_function into a Result wrapping an Expr or NULL
  aggregate_result = pcase(
    # a string is looked up as a method on pl$element() and called with no
    # arguments; result() captures any error raised while doing so
    is_string(aggregate_function), result(`$.Expr`(pl$element(), aggregate_function)()),

    # NULL and Expr inputs go through untouched
    is.null(aggregate_function) || inherits(aggregate_function, "Expr"), Ok(aggregate_function),

    # every other kind of input is rejected
    or_else = Err(" is neither a string, NULL or an Expr")
  )

  # Step 2: prefix any error with the offending parameter and its value
  aggregate_result = map_err(aggregate_result, \(err_msg) paste(
    "param [aggregate_function] being ", str_string(aggregate_function), err_msg
  ))

  # Step 3: the pivot itself only runs when step 1 produced an Ok value
  pivot_result = and_then(aggregate_result, \(aggregate_expr) .pr$DataFrame$pivot_expr(
    self, values, index, columns, maintain_order, sort_columns, aggregate_expr, separator
  ))

  # Step 4: unwrap the Result, tagging any error with the method name
  unwrap(pivot_result, "in $pivot():")
}
6 changes: 6 additions & 0 deletions R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ DataFrame$estimated_size <- function() .Call(wrap__DataFrame__estimated_size, se

# Thin wrapper: forwards to the native routine wrap__DataFrame__null_count via .Call.
# `self` is not a formal argument; it is a free variable resolved from the function's environment.
DataFrame$null_count <- function() .Call(wrap__DataFrame__null_count, self)

# Thin wrapper: forwards its arguments, in order, to the native routine wrap__DataFrame__melt
# via .Call. Note that value_name precedes variable_name. `self` is a free variable resolved
# from the function's environment.
DataFrame$melt <- function(id_vars, value_vars, value_name, variable_name) .Call(wrap__DataFrame__melt, self, id_vars, value_vars, value_name, variable_name)

# Thin wrapper: forwards its arguments, in order, to the native routine
# wrap__DataFrame__pivot_expr via .Call. `self` is a free variable resolved from the
# function's environment.
DataFrame$pivot_expr <- function(values, index, columns, maintain_order, sort_columns, aggregate_expr, separator) .Call(wrap__DataFrame__pivot_expr, self, values, index, columns, maintain_order, sort_columns, aggregate_expr, separator)

# `$` accessor for DataFrame objects: fetches the entry called `name` from the `DataFrame`
# method table, then rebinds that function's environment to this call's environment so the
# method body can see `self`, and returns the function.
#' @export
`$.DataFrame` <- function (self, name) { func <- DataFrame[[name]]; environment(func) <- environment(); func }

Expand Down Expand Up @@ -887,6 +891,8 @@ LazyFrame$join <- function(other, left_on, right_on, how, suffix, allow_parallel

# Thin wrapper: forwards its arguments, in order, to the native routine
# wrap__LazyFrame__sort_by_exprs via .Call. `self` is a free variable resolved from the
# function's environment.
LazyFrame$sort_by_exprs <- function(by, descending, nulls_last) .Call(wrap__LazyFrame__sort_by_exprs, self, by, descending, nulls_last)

# Thin wrapper: forwards its arguments, in order, to the native routine wrap__LazyFrame__melt
# via .Call. Note that value_name precedes variable_name. `self` is a free variable resolved
# from the function's environment.
LazyFrame$melt <- function(id_vars, value_vars, value_name, variable_name, streamable) .Call(wrap__LazyFrame__melt, self, id_vars, value_vars, value_name, variable_name, streamable)

# `$` accessor for LazyFrame objects: fetches the entry called `name` from the `LazyFrame`
# method table, then rebinds that function's environment to this call's environment so the
# method body can see `self`, and returns the function.
#' @export
`$.LazyFrame` <- function (self, name) { func <- LazyFrame[[name]]; environment(func) <- environment(); func }

Expand Down
46 changes: 46 additions & 0 deletions R/lazyframe__lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -752,3 +752,49 @@ LazyFrame_join_asof = function(
) |>
unwrap("in join_asof( ):")
}


#' Unpivot a Frame from wide to long format
#'
#' @param id_vars char vec, columns to use as identifier variables.
#' @param value_vars char vec, columns to use as value (measured) variables.
#' If `value_vars` is empty all columns that are not in `id_vars` will be used.
#' @param variable_name string, Name to give to the `variable` column. Defaults to "variable"
#' @param value_name string, Name to give to the `value` column. Defaults to "value"
#' @param ... not used, forces `streamable` to be passed by name
#' @param streamable Allow this node to run in the streaming engine.
#' If this runs in streaming, the output of the melt operation
#' will not have a stable ordering.
#'
#' @details
#' Optionally leaves identifiers set.
#'
#' This function is useful to massage a DataFrame into a format where one or more
#' columns are identifier variables (id_vars), while all other columns, considered
#' measured variables (value_vars), are "unpivoted" to the row axis, leaving just
#' two non-identifier columns, 'variable' and 'value'.
#'
#' @keywords LazyFrame
#'
#' @return A new `LazyFrame`
#'
#' @examples
#' lf = pl$DataFrame(
#'   a = c("x", "y", "z"),
#'   b = c(1, 3, 5),
#'   c = c(2, 4, 6)
#' )$lazy()
#' lf$melt(id_vars = "a", value_vars = c("b", "c"))$collect()
#'
LazyFrame_melt = function(
    id_vars = NULL,
    value_vars = NULL,
    variable_name = NULL,
    value_name = NULL,
    ...,
    streamable = TRUE) {
  # `%||%` supplies an empty character vector as the fallback for an omitted
  # (NULL) column selection
  id_columns = id_vars %||% character()
  value_columns = value_vars %||% character()

  # note the argument order of the internal method: value_name comes
  # before variable_name
  melt_result = .pr$LazyFrame$melt(
    self, id_columns, value_columns, value_name, variable_name, streamable
  )

  # unwrap the Result, tagging any error with the method name
  unwrap(melt_result, "in $melt( ): ")
}
46 changes: 46 additions & 0 deletions man/DataFrame_melt.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

65 changes: 65 additions & 0 deletions man/DataFrame_pivot.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

55 changes: 55 additions & 0 deletions man/LazyFrame_melt.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions src/rust/src/conversion.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
use smartstring::alias::String as SmartString;
pub(crate) fn strings_to_smartstrings<I, S>(container: I) -> Vec<SmartString>
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
container.into_iter().map(|s| s.as_ref().into()).collect()
}
20 changes: 20 additions & 0 deletions src/rust/src/lazy/dataframe.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::concurrent::{handle_thread_r_requests, PolarsBackgroundHandle};
use crate::conversion::strings_to_smartstrings;
use crate::lazy::dsl::*;
use crate::rdatatype::new_asof_strategy;
use crate::rdatatype::new_join_type;
Expand All @@ -9,6 +10,7 @@ use crate::utils::wrappers::null_to_opt;
use crate::utils::{r_result_list, try_f64_into_usize};
use extendr_api::prelude::*;
use polars::chunked_array::object::AsOfOptions;
use polars::frame::explode::MeltArgs;
use polars::frame::hash_join::JoinType;
use polars::prelude as pl;

Expand Down Expand Up @@ -335,6 +337,24 @@ impl LazyFrame {
let nulls_last = robj_to!(bool, nulls_last)?;
Ok(ldf.sort_by_exprs(exprs, descending, nulls_last).into())
}

/// Unpivots this lazy frame from wide to long format.
///
/// Every argument arrives as an R object and is converted with `robj_to!`, in
/// declaration order; the first failing conversion is propagated with `?`.
/// `id_vars` and `value_vars` are read as string vectors, `value_name` and
/// `variable_name` as optional strings, and `streamable` as a bool.
///
/// The wrapped frame is cloned before melting, so `self` is left untouched.
fn melt(
    &self,
    id_vars: Robj,
    value_vars: Robj,
    value_name: Robj,
    variable_name: Robj,
    streamable: Robj,
) -> Result<Self, String> {
    // Shadow each raw R object with its converted counterpart.
    let id_vars = strings_to_smartstrings(robj_to!(Vec, String, id_vars)?);
    let value_vars = strings_to_smartstrings(robj_to!(Vec, String, value_vars)?);
    let value_name = robj_to!(Option, String, value_name)?.map(Into::into);
    let variable_name = robj_to!(Option, String, variable_name)?.map(Into::into);
    let streamable = robj_to!(bool, streamable)?;

    let melt_args = MeltArgs {
        id_vars,
        value_vars,
        value_name,
        variable_name,
        streamable,
    };
    let melted = self.0.clone().melt(melt_args);
    Ok(melted.into())
}
}

#[derive(Clone)]
Expand Down
1 change: 1 addition & 0 deletions src/rust/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ pub mod concurrent;
pub mod lazy;

pub mod arrow_interop;
pub mod conversion;
pub mod conversion_r_to_s;
pub mod conversion_s_to_r;
pub mod rdataframe;
Expand Down
Loading

0 comments on commit 20002fe

Please sign in to comment.