Merge branch 'main' into polars-0.38.2

pola-rs · Mar 11, 2024 · 0af2cf7 · 0af2cf7
2 parents 49caae7 + ee6b7f9
commit 0af2cf7
Show file tree

Hide file tree

Showing 13 changed files with 413 additions and 25 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -32,3 +32,4 @@
 ^\.editorconfig$
 ^rustfmt\.toml$
 ^\.lintr\.R$
+^\.mega-linter\.yml$
diff --git a/.editorconfig b/.editorconfig
@@ -3,6 +3,8 @@ root = true
 [*]
 insert_final_newline = true
 end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
 
 [*.md]
 indent_style = space

diff --git a/.github/workflows/mega-linter.yaml b/.github/workflows/mega-linter.yaml
@@ -0,0 +1,79 @@
+---
+# MegaLinter GitHub Action configuration file
+# More info at https://megalinter.io
+name: MegaLinter
+
+on:
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.ref }}-${{ github.workflow }}
+  cancel-in-progress: true
+
+jobs:
+  megalinter:
+    name: MegaLinter
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          fetch-depth: 0
+
+      - name: MegaLinter
+        id: ml
+        # You can override MegaLinter flavor used to have faster performances
+        # More info at https://megalinter.io/flavors/
+        uses: oxsecurity/megalinter/flavors/[email protected]
+        env:
+          # All available variables are described in documentation
+          # https://megalinter.io/configuration/
+          VALIDATE_ALL_CODEBASE: true
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # ADD YOUR CUSTOM ENV VARIABLES HERE OR DEFINE THEM IN A FILE .mega-linter.yml AT THE ROOT OF YOUR REPOSITORY
+          # DISABLE: COPYPASTE,SPELL # Uncomment to disable copy-paste and spell checks
+
+      # Upload MegaLinter artifacts
+      - name: Archive production artifacts
+        if: success() || failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: MegaLinter reports
+          path: |
+            megalinter-reports
+            mega-linter.log
+
+      # Create pull request if applicable (for now works only on PR from same repository, not from forks)
+      - name: Create Pull Request with applied fixes
+        id: cpr
+        if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'pull_request' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix')
+        uses: peter-evans/create-pull-request@v6
+        with:
+          token: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}
+          commit-message: "[MegaLinter] Apply linters automatic fixes"
+          title: "[MegaLinter] Apply linters automatic fixes"
+          labels: bot
+      - name: Create PR output
+        if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'pull_request' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix')
+        run: |
+          echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
+          echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
+
+      # Push new commit if applicable (for now works only on PR from same repository, not from forks)
+      - name: Prepare commit
+        if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'commit' && github.ref != 'refs/heads/main' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix')
+        run: sudo chown -Rc $UID .git/
+      - name: Commit and push applied linter fixes
+        if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'commit' && github.ref != 'refs/heads/main' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && !contains(github.event.head_commit.message, 'skip fix')
+        uses: stefanzweifel/git-auto-commit-action@v4
+        with:
+          branch: ${{ github.event.pull_request.head.ref || github.head_ref || github.ref }}
+          commit_message: "[MegaLinter] Apply linters fixes"
+          commit_user_name: megalinter-bot
+          commit_user_email: [email protected]
diff --git a/.mega-linter.yml b/.mega-linter.yml
@@ -0,0 +1,6 @@
+APPLY_FIXES: all
+DISABLE:
+  - RUST
+  - R
+DISABLE_LINTERS:
+  - SPELL_CSPELL
diff --git a/NEWS.md b/NEWS.md
@@ -10,6 +10,7 @@
 -   It is now possible to create an empty `DataFrame` with a specific schema
     with `pl$DataFrame(schema = my_schema)` (#901).
 -   New arguments `dtype` and `nan_to_null` for `pl$Series()` (#902).
+-   New method `<DataFrame>$partition_by()` (#898).
 
 ### Bug fixes
 

diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R
@@ -884,6 +884,8 @@ DataFrame_filter = function(...) {
 #' @details Within each group, the order of the rows is always preserved,
 #' regardless of the `maintain_order` argument.
 #' @return [GroupBy][GroupBy_class] (a DataFrame with special groupby methods like `$agg()`)
+#' @seealso
+#' - [`<DataFrame>$partition_by()`][DataFrame_partition_by]
 #' @examples
 #' df = pl$DataFrame(
 #'   a = c("a", "b", "a", "b", "c"),
@@ -2093,3 +2095,108 @@ DataFrame_group_by_dynamic = function(
     by, start_by, check_sorted
   )
 }
+
+
+#' Split a DataFrame into multiple DataFrames
+#'
+#' Similar to [`$group_by()`][DataFrame_group_by].
+#' Group by the given columns and return the groups as separate [DataFrames][DataFrame_class].
+#' It is useful to use this in combination with functions like [lapply()] or `purrr::map()`.
+#' @param ... Column names or data types to partition by. Passed to [`pl$col()`][pl_col].
+#' @param maintain_order If `TRUE`, ensure that the order of the groups is consistent with the input data.
+#' This is slower than partitioning with `maintain_order = FALSE`.
+#' @param include_key If `TRUE`, include the columns used to partition the DataFrame in the output.
+#' @param as_nested_list This affects the format of the output.
+#' If `FALSE` (default), the output is a flat [list] of [DataFrames][DataFrame_class].
+#' If `TRUE` and at least one of the `maintain_order` or `include_key` arguments is `TRUE`,
+#' then each element of the output has two children: `key` and `data`.
+#' See the examples for more details.
+#' @return A list of [DataFrames][DataFrame_class]. See the examples for details.
+#' @seealso
+#' - [`<DataFrame>$group_by()`][DataFrame_group_by]
+#' @examples
+#' df = pl$DataFrame(
+#'   a = c("a", "b", "a", "b", "c"),
+#'   b = c(1, 2, 1, 3, 3),
+#'   c = c(5, 4, 3, 2, 1)
+#' )
+#' df
+#'
+#' # Pass a single column name to partition by that column.
+#' df$partition_by("a")
+#'
+#' # Partition by multiple columns.
+#' df$partition_by("a", "b")
+#'
+#' # Partition by column data type
+#' df$partition_by(pl$String)
+#'
+#' # If `as_nested_list = TRUE`, the output is a list whose elements have a `key` and a `data` field.
+#' # The `key` is a named list of the key values, and the `data` is the DataFrame.
+#' df$partition_by("a", "b", as_nested_list = TRUE)
+#'
+#' # `as_nested_list = TRUE` should be used with `maintain_order = TRUE` or `include_key = TRUE`.
+#' tryCatch(
+#'   df$partition_by("a", "b", maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE),
+#'   warning = function(w) w
+#' )
+#'
+#' # Example of using with lapply(), and printing the key and the data summary
+#' df$partition_by("a", "b", maintain_order = FALSE, as_nested_list = TRUE) |>
+#'   lapply(\(x) {
+#'     sprintf("\nThe key value of `a` is %s and the key value of `b` is %s\n", x$key$a, x$key$b) |>
+#'       cat()
+#'     x$data$drop(names(x$key))$describe() |>
+#'       print()
+#'     invisible(NULL)
+#'   }) |>
+#'   invisible()
+DataFrame_partition_by = function(
+    ...,
+    maintain_order = TRUE,
+    include_key = TRUE,
+    as_nested_list = FALSE) {
+  # Every failure below is reported with this same context string.
+  context = "in $partition_by():"
+
+  # Turn the dots into the column names to partition by.
+  by = unwrap(result(dots_to_colnames(self, ...)), context)
+
+  if (length(by) == 0L) {
+    unwrap(Err_plain("There is no column to partition by."), context)
+  }
+
+  partitions = unwrap(
+    .pr$DataFrame$partition_by(self, by, maintain_order, include_key),
+    context
+  )
+
+  # Flat output requested: the list of partitions is the result.
+  if (!isTRUE(as_nested_list)) {
+    return(partitions)
+  }
+
+  # Pair each partition with its key; `key_at(index)` supplies the key of the
+  # partition at position `index`.
+  nest = \(key_at) {
+    lapply(seq_along(partitions), \(index) {
+      list(key = key_at(index), data = partitions[[index]])
+    })
+  }
+
+  if (include_key) {
+    # The key columns are still present in each partition: read them from its first row.
+    nest(\(index) partitions[[index]]$select(by)$head(1)$to_list())
+  } else if (maintain_order) {
+    # The key columns were dropped: take the keys from the unique key rows of the
+    # input (order maintained) and match them to the partitions by position.
+    key_df = self$select(by)$unique(maintain_order = TRUE)
+    nest(\(index) key_df$slice(index - 1, 1)$to_list())
+  } else {
+    # Neither way of getting the keys applies here: warn and return the flat list.
+    warning(
+      "cannot use `$partition_by` with ",
+      "`maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE`. ",
+      "Fall back to a flat list."
+    )
+    partitions
+  }
+}
diff --git a/R/dotdotdot.R b/R/dotdotdot.R
@@ -45,9 +45,9 @@ unpack_list = function(..., .context = NULL, .call = sys.call(1L), skip_classes
   l = list2(..., .context = .context, .call = .call)
   if (
     length(l) == 1L &&
-    is.list(l[[1L]]) &&
-    !(!is.null(skip_classes) && inherits(l[[1L]], skip_classes)) &&
-    is.null(names(l))
+      is.list(l[[1L]]) &&
+      !(!is.null(skip_classes) && inherits(l[[1L]], skip_classes)) &&
+      is.null(names(l))
   ) {
     l[[1L]]
   } else {
@@ -79,3 +79,13 @@ unpack_bool_expr_result = function(...) {
       }
     })
 }
+
+
+#' Convert dots to a character vector of column names
+#'
+#' Builds a DataFrame from only the schema of `.df`, selects `pl$col(...)` on it,
+#' and returns the `$columns` of that selection.
+#' @param .df [RPolarsDataFrame]
+#' @param ... Arguments to pass to [`pl$col()`][pl_col]
+#' @param .call Call passed on to `unwrap()` as its `call` argument.
+#' Defaults to `sys.call(1L)`.
+#' @return The `$columns` of the selection (the selected column names).
+#' @noRd
+dots_to_colnames = function(.df, ..., .call = sys.call(1L)) {
+  # The whole chain is the argument of `result()`; its output is handed to `unwrap()`.
+  result(pl$DataFrame(schema = .df$schema)$select(pl$col(...))$columns) |>
+    unwrap(call = .call)
+}
diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R
@@ -178,6 +178,8 @@ RPolarsDataFrame$to_struct <- function(name) .Call(wrap__RPolarsDataFrame__to_st
 
 RPolarsDataFrame$unnest <- function(names) .Call(wrap__RPolarsDataFrame__unnest, self, names)
 
+RPolarsDataFrame$partition_by <- function(by, maintain_order, include_key) .Call(wrap__RPolarsDataFrame__partition_by, self, by, maintain_order, include_key)
+
 RPolarsDataFrame$export_stream <- function(stream_ptr) invisible(.Call(wrap__RPolarsDataFrame__export_stream, self, stream_ptr))
 
 RPolarsDataFrame$from_arrow_record_batches <- function(rbr) .Call(wrap__RPolarsDataFrame__from_arrow_record_batches, rbr)

diff --git a/man/DataFrame_group_by.Rd b/man/DataFrame_group_by.Rd
diff --git a/man/DataFrame_partition_by.Rd b/man/DataFrame_partition_by.Rd
diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs
@@ -328,6 +328,21 @@ impl RPolarsDataFrame {
         self.lazy().unnest(names)?.collect()
     }
 
+    // Group by the columns named in `by` and return the groups as an R list of
+    // separate DataFrames.
+    //
+    // All arguments arrive as R objects and are converted with `robj_to!`:
+    // `by` to `Vec<String>`, `maintain_order` and `include_key` to `bool`.
+    // `maintain_order` selects `partition_by_stable` over `partition_by`;
+    // `include_key` is forwarded unchanged to whichever is called.
+    //
+    // Errors: a failed argument conversion, or a polars error mapped through
+    // `polars_to_rpolars_err`.
+    pub fn partition_by(&self, by: Robj, maintain_order: Robj, include_key: Robj) -> RResult<List> {
+        let by = robj_to!(Vec, String, by)?;
+        let maintain_order = robj_to!(bool, maintain_order)?;
+        let include_key = robj_to!(bool, include_key)?;
+        let out = if maintain_order {
+            self.0.clone().partition_by_stable(by, include_key)
+        } else {
+            self.0.partition_by(by, include_key)
+        }
+        .map_err(polars_to_rpolars_err)?;
+
+        // Wrap each partition with the tuple-struct constructor rather than
+        // transmuting the whole Vec: this needs no `unsafe` and does not depend on
+        // the memory layout of `RPolarsDataFrame` or of `Vec`.
+        let partitions: Vec<RPolarsDataFrame> = out.into_iter().map(RPolarsDataFrame).collect();
+        Ok(List::from_values(partitions))
+    }
+
     pub fn export_stream(&self, stream_ptr: &str) {
         let schema = self.0.schema().to_arrow(false);
         let data_type = ArrowDataType::Struct(schema.fields);