From 471c070d841914005a3f66109ff4c7c79cbeadda Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sun, 7 Jul 2024 04:16:44 +0200 Subject: [PATCH] Implement `$str$extract_many()` (#1163) --- DESCRIPTION | 2 +- NEWS.md | 4 ++ R/expr__string.R | 31 +++++++++++++++ R/extendr-wrappers.R | 2 + man/ExprStr_extract_many.Rd | 47 +++++++++++++++++++++++ src/rust/Cargo.toml | 2 +- src/rust/src/lazy/dsl.rs | 18 +++++++++ tests/testthat/_snaps/after-wrappers.md | 51 +++++++++++++------------ tests/testthat/test-expr_string.R | 35 +++++++++++++++++ tools/lib-sums.tsv | 6 --- 10 files changed, 165 insertions(+), 33 deletions(-) create mode 100644 man/ExprStr_extract_many.Rd delete mode 100644 tools/lib-sums.tsv diff --git a/DESCRIPTION b/DESCRIPTION index 197a1574f..c4458e61e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -118,5 +118,5 @@ Collate: 'zzz.R' Config/rextendr/version: 0.3.1 VignetteBuilder: knitr -Config/polars/LibVersion: 0.41.0 +Config/polars/LibVersion: 0.41.1 Config/polars/RustToolchainVersion: nightly-2024-06-23 diff --git a/NEWS.md b/NEWS.md index e1b9a56df..1dea634d4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,10 @@ ## Polars R Package (development version) +### New features + +- New method `$str$extract_many()` (#1163). + ## Polars R Package 0.18.0 ### Breaking changes diff --git a/R/expr__string.R b/R/expr__string.R index 1cb764373..f39f19d49 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -1093,3 +1093,34 @@ ExprStr_tail = function(n) { .pr$Expr$str_tail(self, n) |> unwrap("in $str$tail():") } + + +#' Use the aho-corasick algorithm to extract matches +#' +#' @param patterns String patterns to search. This can be an Expr or something +#' coercible to an Expr. Strings are parsed as column names. +#' @inheritParams ExprStr_contains_any +#' @param ... Ignored. +#' @param overlapping Whether matches can overlap. +#' +#' @inherit ExprStr_slice return +#' +#' @examples +#' df = pl$DataFrame(values = "discontent") +#' patterns = pl$lit(c("winter", "disco", "onte", "discontent")) +#' +#' df$with_columns( +#' matches = pl$col("values")$str$extract_many(patterns), +#' matches_overlap = pl$col("values")$str$extract_many(patterns, overlapping = TRUE) +#' ) +#' +#' df = pl$DataFrame( +#' values = c("discontent", "rhapsody"), +#' patterns = list(c("winter", "disco", "onte", "discontent"), c("rhap", "ody", "coalesce")) +#' ) +#' +#' df$select(pl$col("values")$str$extract_many("patterns")) +ExprStr_extract_many = function(patterns, ..., ascii_case_insensitive = FALSE, overlapping = FALSE) { + .pr$Expr$str_extract_many(self, patterns, ascii_case_insensitive, overlapping) |> + unwrap("in $str$extract_many():") +} diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index dda668b79..46d638d33 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -1098,6 +1098,8 @@ RPolarsExpr$str_contains_any <- function(patterns, ascii_case_insensitive) .Call RPolarsExpr$str_replace_many <- function(patterns, replace_with, ascii_case_insensitive) .Call(wrap__RPolarsExpr__str_replace_many, self, patterns, replace_with, ascii_case_insensitive) +RPolarsExpr$str_extract_many <- function(patterns, ascii_case_insensitive, overlapping) .Call(wrap__RPolarsExpr__str_extract_many, self, patterns, ascii_case_insensitive, overlapping) + RPolarsExpr$str_find <- function(pat, literal, strict) .Call(wrap__RPolarsExpr__str_find, self, pat, literal, strict) RPolarsExpr$str_head <- function(n) .Call(wrap__RPolarsExpr__str_head, self, n) diff --git a/man/ExprStr_extract_many.Rd b/man/ExprStr_extract_many.Rd new file mode 100644 index 000000000..3fc192671 --- /dev/null +++ b/man/ExprStr_extract_many.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/expr__string.R +\name{ExprStr_extract_many} +\alias{ExprStr_extract_many} +\title{Use the aho-corasick algorithm to extract matches} +\usage{ +ExprStr_extract_many( + patterns, + ..., + ascii_case_insensitive = FALSE, + overlapping = FALSE +) +} +\arguments{ +\item{patterns}{String patterns to search. This can be an Expr or something +coercible to an Expr. Strings are parsed as column names.} + +\item{...}{Ignored.} + +\item{ascii_case_insensitive}{Enable ASCII-aware case insensitive matching. +When this option is enabled, searching will be performed without respect to +case for ASCII letters (a-z and A-Z) only.} + +\item{overlapping}{Whether matches can overlap.} +} +\value{ +Expr: Series of dtype String. +} +\description{ +Use the aho-corasick algorithm to extract matches +} +\examples{ +df = pl$DataFrame(values = "discontent") +patterns = pl$lit(c("winter", "disco", "onte", "discontent")) + +df$with_columns( + matches = pl$col("values")$str$extract_many(patterns), + matches_overlap = pl$col("values")$str$extract_many(patterns, overlapping = TRUE) +) + +df = pl$DataFrame( + values = c("discontent", "rhapsody"), + patterns = list(c("winter", "disco", "onte", "discontent"), c("rhap", "ody", "coalesce")) +) + +df$select(pl$col("values")$str$extract_many("patterns")) +} diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index 18b785800..6b516f518 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "r-polars" -version = "0.41.0" +version = "0.41.1" edition = "2021" rust-version = "1.79.0" publish = false diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 88d203d2e..3a500e942 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -2458,6 +2458,24 @@ impl RPolarsExpr { .into()) } + fn str_extract_many( + &self, + patterns: Robj, + ascii_case_insensitive: Robj, + overlapping: Robj, + ) -> RResult { + Ok(self + .0 + .clone() + .str() + .extract_many( + robj_to!(PLExprCol, patterns)?, + robj_to!(bool, ascii_case_insensitive)?, + robj_to!(bool, overlapping)?, + ) + .into()) + } + pub fn str_find(&self, pat: Robj, literal: Robj, strict: Robj) -> RResult { let pat = robj_to!(PLExpr, pat)?; let literal = robj_to!(Option, bool, literal)?; diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index 29f35f8c3..16dc2c5bd 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -424,31 +424,32 @@ [279] "str_contains" "str_contains_any" [281] "str_count_matches" "str_ends_with" [283] "str_extract" "str_extract_all" - [285] "str_extract_groups" "str_find" - [287] "str_head" "str_hex_decode" - [289] "str_hex_encode" "str_join" - [291] "str_json_decode" "str_json_path_match" - [293] "str_len_bytes" "str_len_chars" - [295] "str_pad_end" "str_pad_start" - [297] "str_replace" "str_replace_all" - [299] "str_replace_many" "str_reverse" - [301] "str_slice" "str_split" - [303] "str_split_exact" "str_splitn" - [305] "str_starts_with" "str_strip_chars" - [307] "str_strip_chars_end" "str_strip_chars_start" - [309] "str_tail" "str_to_date" - [311] "str_to_datetime" "str_to_integer" - [313] "str_to_lowercase" "str_to_time" - [315] "str_to_titlecase" "str_to_uppercase" - [317] "str_zfill" "struct_field_by_name" - [319] "struct_rename_fields" "struct_with_fields" - [321] "sub" "sum" - [323] "tail" "tan" - [325] "tanh" "to_physical" - [327] "top_k" "unique" - [329] "unique_counts" "unique_stable" - [331] "upper_bound" "value_counts" - [333] "var" "xor" + [285] "str_extract_groups" "str_extract_many" + [287] "str_find" "str_head" + [289] "str_hex_decode" "str_hex_encode" + [291] "str_join" "str_json_decode" + [293] "str_json_path_match" "str_len_bytes" + [295] "str_len_chars" "str_pad_end" + [297] "str_pad_start" "str_replace" + [299] "str_replace_all" "str_replace_many" + [301] "str_reverse" "str_slice" + [303] "str_split" "str_split_exact" + [305] "str_splitn" "str_starts_with" + [307] "str_strip_chars" "str_strip_chars_end" + [309] "str_strip_chars_start" "str_tail" + [311] "str_to_date" "str_to_datetime" + [313] "str_to_integer" "str_to_lowercase" + [315] "str_to_time" "str_to_titlecase" + [317] "str_to_uppercase" "str_zfill" + [319] "struct_field_by_name" "struct_rename_fields" + [321] "struct_with_fields" "sub" + [323] "sum" "tail" + [325] "tan" "tanh" + [327] "to_physical" "top_k" + [329] "unique" "unique_counts" + [331] "unique_stable" "upper_bound" + [333] "value_counts" "var" + [335] "xor" # public and private methods of each class When diff --git a/tests/testthat/test-expr_string.R b/tests/testthat/test-expr_string.R index c8dee7327..2acf83f6b 100644 --- a/tests/testthat/test-expr_string.R +++ b/tests/testthat/test-expr_string.R @@ -920,3 +920,38 @@ test_that("$str$tail() works", { ) ) }) + +test_that("$str$extract_many() works", { + df = pl$DataFrame(values = c("discontent", "dollar $")) + patterns = pl$lit(c("winter", "disco", "ONTE", "discontent", "$")) + + expect_equal( + df$select( + matches = pl$col("values")$str$extract_many(patterns), + matches_overlap = pl$col("values")$str$extract_many(patterns, overlapping = TRUE) + )$to_list(), + list(matches = list("disco", "$"), matches_overlap = list(c("disco", "discontent"), "$")) + ) + + # arg "ascii_case_insensitive" works + expect_equal( + df$select( + matches_overlap = pl$col("values")$str$extract_many( + patterns, + ascii_case_insensitive = TRUE, overlapping = TRUE + ) + )$to_list(), + list(matches_overlap = list(c("disco", "onte", "discontent"), "$")) + ) + + # can pass column names as strings + df = pl$DataFrame( + values = c("discontent", "rhapsody"), + patterns = list(c("winter", "disco", "onte", "discontent"), c("rhap", "ody", "coalesce")) + ) + + expect_equal( + df$select(pl$col("values")$str$extract_many("patterns"))$to_list(), + list(values = list("disco", c("rhap", "ody"))) + ) +}) diff --git a/tools/lib-sums.tsv b/tools/lib-sums.tsv deleted file mode 100644 index 9dccf0806..000000000 --- a/tools/lib-sums.tsv +++ /dev/null @@ -1,6 +0,0 @@ -url sha256sum -https://github.com/pola-rs/r-polars/releases/download/lib-v0.41.0/libr_polars-0.41.0-aarch64-apple-darwin.tar.gz 7816ec3d1a4b81942a3db9175f4931918c72c49f573f3cadcaccefcc49781ef1 -https://github.com/pola-rs/r-polars/releases/download/lib-v0.41.0/libr_polars-0.41.0-aarch64-unknown-linux-gnu.tar.gz 1775c3a04697d6cc84f1b2450f9721b3ff91ee132ae0cfc69c32eeec9197dbef -https://github.com/pola-rs/r-polars/releases/download/lib-v0.41.0/libr_polars-0.41.0-x86_64-apple-darwin.tar.gz 78689bc91e3b0a9b8e580d896e98f1f63d1d64c8254726ca6041c50db5e228a5 -https://github.com/pola-rs/r-polars/releases/download/lib-v0.41.0/libr_polars-0.41.0-x86_64-pc-windows-gnu.tar.gz c4b226271fa29cd3a533677fae3afbe55ce76f0107ee74aa06e8f60b3ed298e9 -https://github.com/pola-rs/r-polars/releases/download/lib-v0.41.0/libr_polars-0.41.0-x86_64-unknown-linux-gnu.tar.gz ef4ad1788d95b9c0167c7d4a565858ac2115effce67fbaae39122d5c7f84ef42