From 408b82f906d9f915204555d8ef859396ba8865c2 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 12:42:51 +0200 Subject: [PATCH 01/47] init --- src/rust/Cargo.lock | 498 +++++++++++++++++++++++++++------ src/rust/Cargo.toml | 7 +- src/rust/src/lazy/dataframe.rs | 17 +- src/rust/src/rdataframe/mod.rs | 22 +- 4 files changed, 431 insertions(+), 113 deletions(-) diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index e5744a7c1..5b0219634 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -96,6 +96,18 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf7d0a018de4f6aa429b9d33d69edf69072b1c5b1cb8d3e4a5f7ef898fc3eb76" +[[package]] +name = "arrayref" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" + +[[package]] +name = "arrayvec" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" + [[package]] name = "async-stream" version = "0.3.5" @@ -144,6 +156,12 @@ version = "0.15.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ae037714f313c1353189ead58ef9eec30a8e8dc101b2622d461418fd59e28a9" +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" version = "1.1.0" @@ -215,6 +233,19 @@ dependencies = [ "serde", ] +[[package]] +name = "blake3" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30cca6d3674597c30ddf2c587bf8d9d65c9a84d2326d941cc79c9842dfe0ef52" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -348,6 +379,12 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "constant_time_eq" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" + [[package]] name = "core-foundation" version = "0.9.4" @@ -630,6 +667,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs4" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8" +dependencies = [ + "rustix", + "windows-sys 0.52.0", +] + [[package]] name = "futures" version = "0.3.30" @@ -778,7 +825,26 @@ dependencies = [ "futures-core", "futures-sink", "futures-util", - "http", + "http 0.2.12", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "h2" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http 1.1.0", "indexmap", "slab", "tokio", @@ -855,6 +921,17 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + [[package]] name = "http-body" version = "0.4.6" @@ -862,7 +939,30 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" dependencies = [ "bytes", - "http", + "http 0.2.12", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", "pin-project-lite", ] @@ -894,9 +994,9 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", - "http", - "http-body", + "h2 0.3.26", + "http 0.2.12", + "http-body 0.4.6", "httparse", "httpdate", "itoa", @@ -908,18 +1008,62 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe575dd17d0862a9a33781c8c4696a55c320909004a67a00fb286ba8b1bc496d" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2 0.4.5", + "http 1.1.0", + "http-body 1.0.0", + "httparse", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + [[package]] name = "hyper-rustls" -version = "0.24.2" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" dependencies = [ "futures-util", - "http", - "hyper", + "http 1.1.0", + "hyper 1.3.1", + "hyper-util", "rustls", + "rustls-native-certs", + "rustls-pki-types", "tokio", "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b875924a60b96e5d7b9ae7b066540b1dd1cbd90d1828f54c92e02a283351c56" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "hyper 1.3.1", + "pin-project-lite", + "socket2", + "tokio", + "tower", + "tower-service", + "tracing", ] [[package]] @@ -1444,26 +1588,26 @@ dependencies = [ [[package]] name = "object_store" -version = "0.9.1" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8718f8b65fdf67a45108d1548347d4af7d71fb81ce727bbf9e3b2535e079db3" +checksum = "fbebfd32c213ba1907fa7a9c9138015a8de2b43e30c5aa45b18f7deb46786ad6" dependencies = [ "async-trait", - "base64 0.21.7", + "base64 0.22.0", "bytes", "chrono", "futures", "humantime", - "hyper", + "hyper 1.3.1", "itertools", "md-5", "parking_lot", "percent-encoding", "quick-xml", "rand", - "reqwest", + "reqwest 0.12.5", "ring", - "rustls-pemfile 2.1.2", + "rustls-pemfile", "serde", "serde_json", "snafu", @@ -1583,6 +1727,26 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" +dependencies = [ 
+ "proc-macro2", + "quote", + "syn 2.0.50", +] + [[package]] name = "pin-project-lite" version = "0.2.13" @@ -1612,8 +1776,8 @@ dependencies = [ [[package]] name = "polars" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "getrandom", "polars-arrow", @@ -1632,8 +1796,8 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "ahash", "atoi", @@ -1679,8 +1843,8 @@ dependencies = [ [[package]] name = "polars-compute" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "bytemuck", "either", @@ -1694,8 +1858,8 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1728,8 +1892,8 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "avro-schema", "object_store", @@ -1741,8 +1905,8 @@ dependencies = [ [[package]] name = "polars-expr" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1760,17 +1924,19 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "ahash", "async-trait", "atoi_simd", + "blake3", "bytes", "chrono", "chrono-tz", "fast-float", "flate2", + "fs4", "futures", "home", "itoa", @@ -1789,7 +1955,7 @@ dependencies = [ "polars-utils", "rayon", "regex", - "reqwest", + "reqwest 0.11.27", "ryu", "serde", "serde_json", @@ -1804,8 +1970,8 @@ dependencies = [ [[package]] name = "polars-json" -version = "0.40.0" -source = 
"git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "ahash", "chrono", @@ -1824,19 +1990,21 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "ahash", "bitflags 2.4.2", "futures", "glob", + "memchr", "once_cell", "polars-arrow", "polars-core", "polars-expr", "polars-io", "polars-json", + "polars-mem-engine", "polars-ops", "polars-pipe", "polars-plan", @@ -1848,10 +2016,30 @@ dependencies = [ "version_check", ] +[[package]] +name = "polars-mem-engine" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +dependencies = [ + "futures", + "polars-arrow", + "polars-core", + "polars-error", + "polars-expr", + "polars-io", + "polars-json", + "polars-ops", + "polars-plan", + "polars-time", + "polars-utils", + "rayon", + "tokio", +] + [[package]] name = "polars-ops" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "ahash", "aho-corasick", @@ -1886,8 +2074,8 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "ahash", "async-stream", @@ -1900,9 +2088,11 @@ dependencies = [ "num-traits", "parquet-format-safe", "polars-arrow", + "polars-compute", "polars-error", "polars-utils", "seq-macro", + "serde", "simdutf8", "snap", "streaming-decompression", @@ -1911,8 +2101,8 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -1938,12 +2128,11 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "ahash", "bytemuck", - "chrono", "chrono-tz", "either", "futures", @@ -1969,8 +2158,8 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.40.0" -source = 
"git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "bytemuck", "polars-arrow", @@ -1980,8 +2169,8 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "hex", "once_cell", @@ -1991,6 +2180,7 @@ dependencies = [ "polars-lazy", "polars-ops", "polars-plan", + "polars-time", "rand", "serde", "serde_json", @@ -1999,8 +2189,8 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "atoi", "bytemuck", @@ -2020,8 +2210,8 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.0" +source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" dependencies = [ "ahash", "bytemuck", @@ -2072,6 +2262,53 @@ dependencies = [ "serde", ] +[[package]] +name = "quinn" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4ceeeeabace7857413798eb1ffa1e9c905a9946a57d81fb69b4b71c4d8eb3ad" +dependencies = [ + "bytes", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "thiserror", + "tokio", + "tracing", +] + +[[package]] +name = "quinn-proto" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddf517c03a109db8100448a4be38d498df8a210a99fe0e1b9eaf39e78c640efe" +dependencies = [ + "bytes", + "rand", + "ring", + "rustc-hash", + "rustls", + "slab", + "thiserror", + "tinyvec", + "tracing", +] + +[[package]] +name = "quinn-udp" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9096629c45860fc7fb143e125eb826b5e721e10be3263160c7d60ca832cf8c46" +dependencies = [ + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.52.0", +] + [[package]] name = "quote" version = "1.0.35" @@ -2283,11 +2520,48 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", - "http", - "http-body", - "hyper", + "h2 0.3.26", + "http 0.2.12", + "http-body 0.4.6", + "hyper 0.14.28", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper 0.1.2", + "system-configuration", + "tokio", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "winreg 0.50.0", +] + +[[package]] +name = "reqwest" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" 
+dependencies = [ + "base64 0.22.0", + "bytes", + "futures-core", + "futures-util", + "h2 0.4.5", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "hyper 1.3.1", "hyper-rustls", + "hyper-util", "ipnet", "js-sys", "log", @@ -2295,14 +2569,15 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", + "quinn", "rustls", "rustls-native-certs", - "rustls-pemfile 1.0.4", + "rustls-pemfile", + "rustls-pki-types", "serde", "serde_json", "serde_urlencoded", - "sync_wrapper", - "system-configuration", + "sync_wrapper 1.0.1", "tokio", "tokio-rustls", "tokio-util", @@ -2312,7 +2587,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "winreg", + "winreg 0.52.0", ] [[package]] @@ -2342,6 +2617,12 @@ version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustix" version = "0.38.31" @@ -2357,37 +2638,31 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.11" +version = "0.23.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" +checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b" dependencies = [ - "log", + "once_cell", "ring", + "rustls-pki-types", "rustls-webpki", - "sct", + "subtle", + "zeroize", ] [[package]] name = "rustls-native-certs" -version = "0.6.3" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792" dependencies = [ "openssl-probe", - "rustls-pemfile 1.0.4", + "rustls-pemfile", + "rustls-pki-types", "schannel", "security-framework", ] -[[package]] -name = "rustls-pemfile" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" -dependencies = [ - "base64 0.21.7", -] - [[package]] name = "rustls-pemfile" version = "2.1.2" @@ -2406,11 +2681,12 @@ checksum = "ecd36cc4259e3e4514335c4a138c6b43171a8d61d8f5c9348f9fc7529416f247" [[package]] name = "rustls-webpki" -version = "0.101.7" +version = "0.102.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +checksum = "f3bce581c0dd41bce533ce695a1437fa16a7ab5ac3ccfa99fe1a620a7885eabf" dependencies = [ "ring", + "rustls-pki-types", "untrusted", ] @@ -2456,16 +2732,6 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "sct" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "security-framework" version = "2.10.0" @@ -2654,9 +2920,9 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.39.0" +version = "0.47.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "743b4dc2cbde11890ccb254a8fc9d537fa41b36da00de2a1c5e9848c9bc42bd7" 
+checksum = "295e9930cd7a97e58ca2a070541a3ca502b17f5d1fa7157376d0fabd85324f25" dependencies = [ "log", ] @@ -2742,6 +3008,12 @@ dependencies = [ "syn 2.0.50", ] +[[package]] +name = "subtle" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d0208408ba0c3df17ed26eb06992cb1a1268d41b2c0e12e65203fbe3972cee5" + [[package]] name = "syn" version = "1.0.109" @@ -2770,6 +3042,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +[[package]] +name = "sync_wrapper" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" + [[package]] name = "sysinfo" version = "0.30.5" @@ -2898,11 +3176,12 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.24.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ "rustls", + "rustls-pki-types", "tokio", ] @@ -2920,6 +3199,27 @@ dependencies = [ "tracing", ] +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "pin-project", + "pin-project-lite", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" + [[package]] name = "tower-service" version = "0.3.2" @@ -3406,6 +3706,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "winreg" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "xxhash-rust" version = "0.8.10" @@ -3432,6 +3742,12 @@ dependencies = [ "syn 2.0.50", ] +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + [[package]] name = "zstd" version = "0.13.0" diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index a042f4fab..15c868f2c 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -50,8 +50,8 @@ serde_json = "*" smartstring = "1.0.1" state = "0.6.0" thiserror = "1.0.61" -polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "318ec405632410a41f634de7aeff46e89a25eab9", default-features = false } -polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "318ec405632410a41f634de7aeff46e89a25eab9", default-features = false } +polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "21602ba8610ce16e7ba675998e84e238d8066c98", default-features = false } +polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "21602ba8610ce16e7ba675998e84e238d8066c98", default-features = false } either = "1" [dependencies.polars] @@ -76,7 +76,6 @@ features = [ "cumulative_eval", "cutqcut", "dataframe_arithmetic", - "date_offset", "decompress-fast", "diagonal_concat", "diff", @@ -157,4 +156,4 @@ 
features = [ "zip_with", ] git = "https://github.com/pola-rs/polars.git" -rev = "318ec405632410a41f634de7aeff46e89a25eab9" +rev = "21602ba8610ce16e7ba675998e84e238d8066c98" diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index 53a6d901b..46a794163 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -13,7 +13,7 @@ use crate::utils::{r_result_list, try_f64_into_usize}; use extendr_api::prelude::*; use pl::{AsOfOptions, Duration, RollingGroupOptions}; use polars::chunked_array::ops::SortMultipleOptions; -use polars::frame::explode::MeltArgs; +use polars::frame::explode::UnpivotArgs; use polars::prelude as pl; use polars::prelude::{JoinCoalesce, SerializeOptions}; @@ -499,22 +499,22 @@ impl RPolarsLazyFrame { .into()) } - fn melt( + fn unpivot( &self, - id_vars: Robj, - value_vars: Robj, + on: Robj, + index: Robj, value_name: Robj, variable_name: Robj, streamable: Robj, ) -> RResult { - let args = MeltArgs { - id_vars: strings_to_smartstrings(robj_to!(Vec, String, id_vars)?), - value_vars: strings_to_smartstrings(robj_to!(Vec, String, value_vars)?), + let args = UnpivotArgs { + on: strings_to_smartstrings(robj_to!(Vec, String, on)?), + index: strings_to_smartstrings(robj_to!(Vec, String, index)?), value_name: robj_to!(Option, String, value_name)?.map(|s| s.into()), variable_name: robj_to!(Option, String, variable_name)?.map(|s| s.into()), streamable: robj_to!(bool, streamable)?, }; - Ok(self.0.clone().melt(args).into()) + Ok(self.0.clone().unpivot(args).into()) } fn rename(&self, existing: Robj, new: Robj) -> RResult { @@ -590,6 +590,7 @@ impl RPolarsLazyFrame { fast_projection: _, row_estimate: _, eager, + new_streaming: _, } = self.0.get_current_optimizations(); list!( type_coercion = type_coercion, diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index d6d8452e8..b5f6b1867 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -25,7 +25,7 @@ use polars_core::utils::arrow; use crate::utils::{collect_hinted_result, r_result_list}; use crate::conversion::strings_to_smartstrings; -use polars::frame::explode::MeltArgs; +use polars::frame::explode::UnpivotArgs; use polars::prelude::pivot::{pivot, pivot_stable}; pub struct OwnedDataFrameIterator { @@ -382,23 +382,25 @@ impl RPolarsDataFrame { self.0.clone().null_count().into() } - fn melt( + fn unpivot( &self, - id_vars: Robj, - value_vars: Robj, + on: Robj, + index: Robj, value_name: Robj, variable_name: Robj, + streamable: Robj, ) -> RResult { - let args = MeltArgs { - id_vars: strings_to_smartstrings(robj_to!(Vec, String, id_vars)?), - value_vars: strings_to_smartstrings(robj_to!(Vec, String, value_vars)?), + let args = UnpivotArgs { + on: strings_to_smartstrings(robj_to!(Vec, String, on)?), + index: strings_to_smartstrings(robj_to!(Vec, String, index)?), value_name: robj_to!(Option, String, value_name)?.map(|s| s.into()), variable_name: robj_to!(Option, String, variable_name)?.map(|s| s.into()), - streamable: false, + streamable: robj_to!(bool, streamable)?, }; - self.0 - .melt2(args) + let ldf = self.0.clone(); + + ldf.unpivot(args) .map_err(polars_to_rpolars_err) .map(RPolarsDataFrame) } From 9949b79ba553c70d94a105105559e2a6b41ce688 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 12:45:21 +0200 Subject: [PATCH 02/47] fix read_json --- src/rust/src/rdataframe/read_ndjson.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rust/src/rdataframe/read_ndjson.rs 
b/src/rust/src/rdataframe/read_ndjson.rs index a2d51d1a3..6e3091d86 100644 --- a/src/rust/src/rdataframe/read_ndjson.rs +++ b/src/rust/src/rdataframe/read_ndjson.rs @@ -38,7 +38,7 @@ pub fn new_from_ndjson( }?; linereader - .with_infer_schema_length(robj_to!(Option, usize, infer_schema_length)?) + .with_infer_schema_length(robj_to!(Option, nonzero_usize, infer_schema_length)?) .with_batch_size(robj_to!(Option, nonzero_usize, batch_size)?) .with_n_rows(robj_to!(Option, usize, n_rows)?) .low_memory(robj_to!(bool, low_memory)?) From 071a5baffb9916a2029da3d8fa4562c08ddaaa31 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 12:47:00 +0200 Subject: [PATCH 03/47] fix feature names [skip ci] --- src/rust/Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index 15c868f2c..f81b5efe8 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -115,8 +115,11 @@ features = [ "meta", "mode", "moment", + "month_start", + "month_end", "ndarray", "object", + "offset_by", "parquet", "partition_by", "pct_change", From b5454442f1c347129becabbb67d0cbc5bcc5247a Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 12:49:57 +0200 Subject: [PATCH 04/47] remove offset arg from dt_truncate and dt_round [skip ci] --- src/rust/src/lazy/dsl.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 129f1398e..6a54478f4 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -1460,22 +1460,17 @@ impl RPolarsExpr { // datetime methods - pub fn dt_truncate(&self, every: Robj, offset: String) -> RResult { + pub fn dt_truncate(&self, every: Robj) -> RResult { Ok(self .0 .clone() .dt() - .truncate(robj_to!(PLExpr, every)?, offset) + .truncate(robj_to!(PLExpr, every)?) 
.into()) } - pub fn dt_round(&self, every: Robj, offset: &str) -> RResult { - Ok(self - .0 - .clone() - .dt() - .round(robj_to!(PLExpr, every)?, offset) - .into()) + pub fn dt_round(&self, every: Robj) -> RResult { + Ok(self.0.clone().dt().round(robj_to!(PLExpr, every)?).into()) } pub fn dt_time(&self) -> RResult { From f983eb359b8b6193655e9ff8df2800dfe45b7152 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 12:52:48 +0200 Subject: [PATCH 05/47] add arg normalize in value_counts [skip ci] --- src/rust/src/lazy/dsl.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 6a54478f4..9c6381c85 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -1081,8 +1081,11 @@ impl RPolarsExpr { } } - pub fn value_counts(&self, sort: bool, parallel: bool, name: String) -> Self { - self.0.clone().value_counts(sort, parallel, name).into() + pub fn value_counts(&self, sort: bool, parallel: bool, name: String, normalize: bool) -> Self { + self.0 + .clone() + .value_counts(sort, parallel, name, normalize) + .into() } pub fn unique_counts(&self) -> Self { From 7fe3b26bcb09610f91c4d38ba6ca967a39f21290 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 12:52:59 +0200 Subject: [PATCH 06/47] fix [skip ci] --- src/rust/src/rdataframe/mod.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index b5f6b1867..4b61646b0 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -398,9 +398,8 @@ impl RPolarsDataFrame { streamable: robj_to!(bool, streamable)?, }; - let ldf = self.0.clone(); - - ldf.unpivot(args) + self.0 + .unpivot(args) .map_err(polars_to_rpolars_err) .map(RPolarsDataFrame) } From a46d2f75376e243a683d5e992a8e98b345cdbd60 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 12:56:08 +0200 Subject: [PATCH 07/47] remove most args of top_k and bottom_k --- src/rust/src/lazy/dsl.rs | 46 ++++------------------------------------ 1 file changed, 4 insertions(+), 42 deletions(-) diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 9c6381c85..fbde29d74 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -261,50 +261,12 @@ impl RPolarsExpr { .into() } - pub fn top_k( - &self, - k: Robj, - nulls_last: Robj, - maintain_order: Robj, - multithreaded: Robj, - ) -> RResult { - let nulls_last = robj_to!(bool, nulls_last)?; - let multithreaded = robj_to!(bool, multithreaded)?; - let maintain_order = robj_to!(bool, maintain_order)?; - Ok(self - .0 - .clone() - .top_k( - robj_to!(PLExpr, k)?, - SortOptions::default() - .with_nulls_last(nulls_last) - .with_maintain_order(maintain_order) - .with_multithreaded(multithreaded), - ) - .into()) + pub fn top_k(&self, k: Robj) -> RResult { + Ok(self.0.clone().top_k(robj_to!(PLExpr, k)?).into()) } - pub fn bottom_k( - &self, - k: Robj, - nulls_last: Robj, - maintain_order: Robj, - multithreaded: Robj, - ) -> RResult { - let nulls_last = robj_to!(bool, nulls_last)?; - let multithreaded = robj_to!(bool, multithreaded)?; - let maintain_order = robj_to!(bool, maintain_order)?; - Ok(self - .0 - .clone() - .bottom_k( - robj_to!(PLExpr, k)?, - SortOptions::default() - .with_nulls_last(nulls_last) - .with_maintain_order(maintain_order) - .with_multithreaded(multithreaded), - ) - .into()) + pub fn bottom_k(&self, k: Robj) -> RResult { + Ok(self.0.clone().bottom_k(robj_to!(PLExpr, k)?).into()) } pub fn 
arg_max(&self) -> Self { From 48dac1aa54b9dfe28c7ff2cb4bbd332fd12ee0fa Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 12:58:05 +0200 Subject: [PATCH 08/47] remove time related args from date_range functions [skip ci] --- src/rust/src/rlib.rs | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/src/rust/src/rlib.rs b/src/rust/src/rlib.rs index 001b1b3a6..10df4c0e7 100644 --- a/src/rust/src/rlib.rs +++ b/src/rust/src/rlib.rs @@ -86,40 +86,22 @@ fn concat_str(dotdotdot: Robj, separator: Robj, ignore_nulls: Robj) -> RResult RResult { +fn date_range(start: Robj, end: Robj, interval: &str, closed: Robj) -> RResult { Ok(RPolarsExpr(polars::lazy::prelude::date_range( robj_to!(PLExprCol, start)?, robj_to!(PLExprCol, end)?, pl::Duration::parse(interval), robj_to!(ClosedWindow, closed)?, - robj_to!(Option, timeunit, time_unit)?, - robj_to!(Option, String, time_zone)?, ))) } #[extendr] -fn date_ranges( - start: Robj, - end: Robj, - interval: &str, - closed: Robj, - time_unit: Robj, - time_zone: Robj, -) -> RResult { +fn date_ranges(start: Robj, end: Robj, interval: &str, closed: Robj) -> RResult { Ok(RPolarsExpr(polars::lazy::prelude::date_ranges( robj_to!(PLExprCol, start)?, robj_to!(PLExprCol, end)?, pl::Duration::parse(interval), robj_to!(ClosedWindow, closed)?, - robj_to!(Option, timeunit, time_unit)?, - robj_to!(Option, String, time_zone)?, ))) } From d5c84839e1cf81bce4996e4e7343cebf23bfc333 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 13:02:43 +0200 Subject: [PATCH 09/47] add normalize arg in value_counts for series too [skip ci] --- src/rust/src/series.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/rust/src/series.rs b/src/rust/src/series.rs index 85fc0ddae..35174814b 100644 --- a/src/rust/src/series.rs +++ b/src/rust/src/series.rs @@ -166,9 +166,10 @@ impl RPolarsSeries { sort: bool, parallel: bool, name: String, + normalize: bool, ) -> std::result::Result { self.0 - .value_counts(sort, parallel, name) + .value_counts(sort, parallel, name, normalize) .map(RPolarsDataFrame) .map_err(|err| format!("in value_counts: {:?}", err)) } From 03289f409d0139538ffb90eabdaeb42399722d86 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 13:14:46 +0200 Subject: [PATCH 10/47] fix series arithmetic [skip ci] --- src/rust/src/series.rs | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/rust/src/series.rs b/src/rust/src/series.rs index 35174814b..13184609c 100644 --- a/src/rust/src/series.rs +++ b/src/rust/src/series.rs @@ -366,24 +366,34 @@ impl RPolarsSeries { ) } - pub fn add(&self, other: &RPolarsSeries) -> Self { - (&self.0 + &other.0).into() + pub fn add(&self, other: &RPolarsSeries) -> RResult { + Ok((&self.0 + &other.0) + .map(Into::into) + .map_err(polars_to_rpolars_err)?) } - pub fn sub(&self, other: &RPolarsSeries) -> Self { - (&self.0 - &other.0).into() + pub fn sub(&self, other: &RPolarsSeries) -> RResult { + Ok((&self.0 - &other.0) + .map(Into::into) + .map_err(polars_to_rpolars_err)?) } - pub fn mul(&self, other: &RPolarsSeries) -> Self { - (&self.0 * &other.0).into() + pub fn mul(&self, other: &RPolarsSeries) -> RResult { + Ok((&self.0 * &other.0) + .map(Into::into) + .map_err(polars_to_rpolars_err)?) } - pub fn div(&self, other: &RPolarsSeries) -> Self { - (&self.0 / &other.0).into() + pub fn div(&self, other: &RPolarsSeries) -> RResult { + Ok((&self.0 / &other.0) + .map(Into::into) + .map_err(polars_to_rpolars_err)?) 
} - pub fn rem(&self, other: &RPolarsSeries) -> Self { - (&self.0 % &other.0).into() + pub fn rem(&self, other: &RPolarsSeries) -> RResult { + Ok((&self.0 % &other.0) + .map(Into::into) + .map_err(polars_to_rpolars_err)?) } pub fn map_elements( From fa4f382694974da67fc8aed1720371ea9a819c1f Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 13:17:42 +0200 Subject: [PATCH 11/47] split replace and replace_strict [skip ci] --- src/rust/src/lazy/dsl.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index fbde29d74..9ea29d735 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -1087,7 +1087,15 @@ impl RPolarsExpr { self.0.clone().peak_max().into() } - pub fn replace( + pub fn replace(&self, old: Robj, new: Robj) -> RResult { + Ok(self + .0 + .clone() + .replace(robj_to!(PLExpr, old)?, robj_to!(PLExpr, new)?) + .into()) + } + + pub fn replace_strict( &self, old: Robj, new: Robj, @@ -1097,7 +1105,7 @@ impl RPolarsExpr { Ok(self .0 .clone() - .replace( + .replace_strict( robj_to!(PLExpr, old)?, robj_to!(PLExpr, new)?, robj_to!(Option, PLExpr, default)?, From ab2b68cb94a00dcbce1280d38544efe0edba1106 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 13:19:21 +0200 Subject: [PATCH 12/47] remove unnecessary unsafe tag [skip ci] --- src/rust/src/conversion_s_to_r.rs | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/rust/src/conversion_s_to_r.rs b/src/rust/src/conversion_s_to_r.rs index 09c936ea8..8199148e1 100644 --- a/src/rust/src/conversion_s_to_r.rs +++ b/src/rust/src/conversion_s_to_r.rs @@ -120,22 +120,17 @@ pub fn pl_series_to_list( let mut v: Vec = Vec::with_capacity(s.len()); let ca = s.list().unwrap(); - // Safty:amortized_iter() The returned should never be cloned or taken longer than a single iteration, - // as every call on next of the iterator will change the contents of that Series. - unsafe { - for opt_s in ca.amortized_iter() { - match opt_s { - Some(s) => { - let s_ref = s.as_ref(); - // is safe because s is read to generate new Robj, then discarded. 
- let inner_val = - to_list_recursive(s_ref, tag_structs, int64_conversion)?; - v.push(inner_val); - } + for opt_s in ca.amortized_iter() { + match opt_s { + Some(s) => { + let s_ref = s.as_ref(); + let inner_val = + to_list_recursive(s_ref, tag_structs, int64_conversion)?; + v.push(inner_val); + } - None => { - v.push(r!(extendr_api::NULL)); - } + None => { + v.push(r!(extendr_api::NULL)); } } } From f45ecbb0ec7a72d93ad08ea4a129c9d540a3854d Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 13:30:19 +0200 Subject: [PATCH 13/47] add order_by args to over [skip ci] --- src/rust/src/lazy/dsl.rs | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 9ea29d735..49809bf34 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -1903,14 +1903,33 @@ impl RPolarsExpr { .into()) } - pub fn over(&self, partition_by: Robj, mapping: Robj) -> RResult { + pub fn over( + &self, + partition_by: Robj, + order_by: Robj, + order_by_descending: bool, + order_by_nulls_last: bool, + mapping: Robj, + ) -> RResult { + let partition_by = robj_to!(Vec, PLExpr, partition_by)?; + + let order_by = robj_to!(Option, Vec, PLExpr, order_by)?.map(|order_by| { + ( + order_by, + SortOptions { + descending: order_by_descending, + nulls_last: order_by_nulls_last, + maintain_order: false, + ..Default::default() + }, + ) + }); + + let mapping = robj_to!(WindowMapping, mapping)?; Ok(self .0 .clone() - .over_with_options( - robj_to!(Vec, PLExpr, partition_by)?, - robj_to!(WindowMapping, mapping)?, - ) + .over_with_options(partition_by, order_by, mapping) .into()) } From b58b0057f2f099e55ad5909acb33dd92161aa627 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 13:34:16 +0200 Subject: [PATCH 14/47] fix compil for map_batches variants --- src/rust/src/lazy/dsl.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 49809bf34..197ce32b7 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -1951,8 +1951,8 @@ impl RPolarsExpr { // set expected type of output from R function let ot = robj_to!(Option, PLPolarsDataType, output_type)?; let output_map = pl::GetOutput::map_field(move |fld| match ot { - Some(ref dt) => pl::Field::new(fld.name(), dt.clone()), - None => fld.clone(), + Some(ref dt) => Ok(pl::Field::new(fld.name(), dt.clone())), + None => Ok(fld.clone()), }); robj_to!(bool, agg_list) @@ -1985,8 +1985,8 @@ impl RPolarsExpr { let ot = robj_to!(Option, PLPolarsDataType, output_type)?; let output_map = pl::GetOutput::map_field(move |fld| match ot { - Some(ref dt) => pl::Field::new(fld.name(), dt.clone()), - None => fld.clone(), + Some(ref dt) => Ok(pl::Field::new(fld.name(), dt.clone())), + None => Ok(fld.clone()), }); robj_to!(bool, agg_list) @@ -2018,8 +2018,8 @@ impl RPolarsExpr { let ot = null_to_opt(output_type).map(|rdt| rdt.0.clone()); let output_map = pl::GetOutput::map_field(move |fld| match ot { - Some(ref dt) => pl::Field::new(fld.name(), dt.clone()), - None => fld.clone(), + Some(ref dt) => Ok(pl::Field::new(fld.name(), dt.clone())), + None => Ok(fld.clone()), }); self.0.clone().apply(rbgfunc, output_map).into() From cc9e6ca10da938cad27b9adc30cc875c02cda7af Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 13:34:41 +0200 Subject: [PATCH 15/47] rename str_concat to str_join [skip ci] --- src/rust/src/lazy/dsl.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 197ce32b7..d39975449 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -2118,12 +2118,12 @@ impl RPolarsExpr { self.clone().0.str().len_chars().into() } - pub fn str_concat(&self, delimiter: Robj, ignore_nulls: Robj) -> RResult { + pub fn str_join(&self, delimiter: Robj, ignore_nulls: Robj) -> RResult { Ok(self .0 .clone() .str() - .concat(robj_to!(str, delimiter)?, robj_to!(bool, ignore_nulls)?) + .join(robj_to!(str, delimiter)?, robj_to!(bool, ignore_nulls)?) .into()) } From e9ff1532279cca13c576cfc4684d869e3ae78fff Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 14:10:54 +0200 Subject: [PATCH 16/47] robj_to_statistics_options [skip ci] --- src/rust/src/lazy/dataframe.rs | 2 +- src/rust/src/rdataframe/mod.rs | 2 +- src/rust/src/rdatatype.rs | 17 +++++++++++++++++ src/rust/src/utils/mod.rs | 4 ++++ 4 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index 46a794163..c2feb00dc 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -108,7 +108,7 @@ impl RPolarsLazyFrame { ) -> RResult<()> { let pqwo = polars::prelude::ParquetWriteOptions { compression: new_parquet_compression(compression_method, compression_level)?, - statistics: robj_to!(bool, statistics)?, + statistics: robj_to!(StatisticsOptions, statistics)?, row_group_size: robj_to!(Option, usize, row_group_size)?, data_pagesize_limit: robj_to!(Option, usize, data_pagesize_limit)?, maintain_order: robj_to!(bool, maintain_order)?, diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index 4b61646b0..5b14e8df8 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -575,7 +575,7 @@ impl RPolarsDataFrame { compression_method, compression_level, )?) - .with_statistics(robj_to!(bool, statistics)?) + .with_statistics(robj_to!(StatisticsOptions, statistics)?) .with_row_group_size(robj_to!(Option, usize, row_group_size)?) .with_data_page_size(robj_to!(Option, usize, data_pagesize_limit)?) .set_parallel(true) diff --git a/src/rust/src/rdatatype.rs b/src/rust/src/rdatatype.rs index 8b25d9f6f..7b46dd80f 100644 --- a/src/rust/src/rdatatype.rs +++ b/src/rust/src/rdatatype.rs @@ -720,6 +720,23 @@ pub fn robj_new_null_behavior(robj: Robj) -> RResult RResult { + use pl::StatisticsOptions as SO; + let hm = robj + .as_list() + .unwrap() + .into_hashmap() + .into_iter() + .map(|xi| (xi.0, xi.1.as_bool().unwrap())) + .collect::>(); + let mut out = SO::default(); + out.min_value = *hm.get(&"min_value").unwrap(); + out.max_value = *hm.get(&"max_value").unwrap(); + out.distinct_count = *hm.get(&"distinct_count").unwrap(); + out.null_count = *hm.get(&"null_count").unwrap(); + Ok(out) +} + pub fn parse_fill_null_strategy( strategy: &str, limit: Option, diff --git a/src/rust/src/utils/mod.rs b/src/rust/src/utils/mod.rs index 78359950b..e72b0cb28 100644 --- a/src/rust/src/utils/mod.rs +++ b/src/rust/src/utils/mod.rs @@ -1087,6 +1087,10 @@ macro_rules! 
robj_to_inner { (WindowMapping, $a:ident) => { $crate::rdatatype::robj_to_window_mapping($a) }; + + (StatisticsOptions, $a:ident) => { + $crate::rdatatype::robj_to_statistics_options($a) + }; } //convert any Robj to appropriate rust type with informative error Strings From ea0b2f16d42e787c9e0ba5469547826a3c1441a6 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 14:27:38 +0200 Subject: [PATCH 17/47] fix compil for parquet [skip ci] --- src/rust/src/rdataframe/read_parquet.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/rust/src/rdataframe/read_parquet.rs b/src/rust/src/rdataframe/read_parquet.rs index 07ac9578f..339f3b5b2 100644 --- a/src/rust/src/rdataframe/read_parquet.rs +++ b/src/rust/src/rdataframe/read_parquet.rs @@ -42,8 +42,9 @@ pub fn new_from_parquet( cloud_options, use_statistics: robj_to!(bool, use_statistics)?, hive_options: polars::io::HiveOptions { - enabled: robj_to!(bool, hive_partitioning)?, - schema: None, // TODO: implement a option to set this + enabled: robj_to!(Option, bool, hive_partitioning)?, + hive_start_idx: 0, // TODO: is it actually 0? + schema: None, // TODO: implement a option to set this }, glob: robj_to!(bool, glob)?, }; From 51e612f93ac073297ae931636dc734022a0b404a Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 14:28:49 +0200 Subject: [PATCH 18/47] fix unpivot [skip ci] --- src/rust/src/rdataframe/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index 5b14e8df8..a8dace4cb 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -399,7 +399,7 @@ impl RPolarsDataFrame { }; self.0 - .unpivot(args) + .unpivot2(args) .map_err(polars_to_rpolars_err) .map(RPolarsDataFrame) } From 934781ac4fbcba00c7237116fa72e46af792eb15 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 14:43:29 +0200 Subject: [PATCH 19/47] bump to 0.41.1 [skip ci] --- src/rust/Cargo.lock | 72 +++++++++++++++++----------------- src/rust/Cargo.toml | 6 +-- src/rust/src/rdataframe/mod.rs | 6 +-- 3 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index 5b0219634..66a49ee73 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -1776,8 +1776,8 @@ dependencies = [ [[package]] name = "polars" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "getrandom", "polars-arrow", @@ -1796,8 +1796,8 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "ahash", "atoi", @@ -1843,8 +1843,8 @@ dependencies = [ [[package]] name = "polars-compute" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = 
"git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "bytemuck", "either", @@ -1858,8 +1858,8 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1892,8 +1892,8 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "avro-schema", "object_store", @@ -1905,8 +1905,8 @@ dependencies = [ [[package]] name = "polars-expr" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1924,8 +1924,8 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "ahash", "async-trait", @@ -1970,8 +1970,8 @@ dependencies = [ [[package]] name = "polars-json" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "ahash", "chrono", @@ -1990,8 +1990,8 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "ahash", "bitflags 2.4.2", @@ -2018,8 +2018,8 @@ dependencies = [ [[package]] name = "polars-mem-engine" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "futures", "polars-arrow", @@ -2038,8 +2038,8 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = 
"git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "ahash", "aho-corasick", @@ -2074,8 +2074,8 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "ahash", "async-stream", @@ -2101,8 +2101,8 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -2128,8 +2128,8 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "ahash", "bytemuck", @@ -2158,8 +2158,8 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "bytemuck", "polars-arrow", @@ -2169,8 +2169,8 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "hex", "once_cell", @@ -2189,8 +2189,8 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "atoi", "bytemuck", @@ -2210,8 +2210,8 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.41.0" -source = "git+https://github.com/pola-rs/polars.git?rev=21602ba8610ce16e7ba675998e84e238d8066c98#21602ba8610ce16e7ba675998e84e238d8066c98" +version = "0.41.1" +source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" dependencies = [ "ahash", "bytemuck", diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index f81b5efe8..1a1ee34af 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -50,8 +50,8 @@ serde_json = "*" smartstring = "1.0.1" state = "0.6.0" thiserror = "1.0.61" -polars-core = { git = "https://github.com/pola-rs/polars.git", rev = 
"21602ba8610ce16e7ba675998e84e238d8066c98", default-features = false } -polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "21602ba8610ce16e7ba675998e84e238d8066c98", default-features = false } +polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "c0871ef8f8bcbe2108c25137604502f462549b87", default-features = false } +polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "c0871ef8f8bcbe2108c25137604502f462549b87", default-features = false } either = "1" [dependencies.polars] @@ -159,4 +159,4 @@ features = [ "zip_with", ] git = "https://github.com/pola-rs/polars.git" -rev = "21602ba8610ce16e7ba675998e84e238d8066c98" +rev = "c0871ef8f8bcbe2108c25137604502f462549b87" diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index a8dace4cb..bc2302c3c 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -407,8 +407,8 @@ impl RPolarsDataFrame { #[allow(clippy::too_many_arguments)] pub fn pivot_expr( &self, + on: Robj, index: Robj, - columns: Robj, values: Robj, maintain_order: Robj, sort_columns: Robj, @@ -423,8 +423,8 @@ impl RPolarsDataFrame { fun( &self.0, - robj_to!(Vec, String, index)?, - robj_to!(Vec, String, columns)?, + robj_to!(Vec, String, on)?, + robj_to!(Option, Vec, String, index)?, robj_to!(Option, Vec, String, values)?, robj_to!(bool, sort_columns)?, robj_to!(Option, PLExpr, aggregate_expr)?, From 700afd649be722ba98382852ce9cea81aa209ab1 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 15:46:22 +0200 Subject: [PATCH 20/47] remove args, rename funs [skip ci] --- R/dataframe__frame.R | 17 ++- R/expr__datetime.R | 42 +---- R/expr__expr.R | 48 +++--- R/expr__string.R | 15 +- R/extendr-wrappers.R | 30 ++-- R/functions__eager.R | 48 +----- R/lazyframe__lazy.R | 22 +-- ...DataFrame_melt.Rd => DataFrame_unpivot.Rd} | 21 +-- man/ExprDT_round.Rd | 16 +- man/ExprDT_truncate.Rd | 9 +- man/{ExprStr_concat.Rd => ExprStr_join.Rd} | 10 +- man/Expr_bottom_k.Rd | 19 +-- man/Expr_over.Rd | 5 +- man/Expr_top_k.Rd | 19 +-- man/Expr_value_counts.Rd | 16 +- ...LazyFrame_melt.Rd => LazyFrame_unpivot.Rd} | 24 +-- man/Series_value_counts.Rd | 4 +- man/pl_date_range.Rd | 18 +-- man/pl_date_ranges.Rd | 18 +-- tests/testthat/_snaps/after-wrappers.md | 144 +++++++++--------- tests/testthat/test-expr_expr.R | 12 ++ 21 files changed, 227 insertions(+), 330 deletions(-) rename man/{DataFrame_melt.Rd => DataFrame_unpivot.Rd} (77%) rename man/{ExprStr_concat.Rd => ExprStr_join.Rd} (77%) rename man/{LazyFrame_melt.Rd => LazyFrame_unpivot.Rd} (80%) diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 819a47efe..e90b8f089 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1490,7 +1490,7 @@ DataFrame_join_asof = function( -#' @inherit LazyFrame_melt +#' @inherit LazyFrame_unpivot #' @keywords DataFrame #' #' @return A new `DataFrame` @@ -1502,16 +1502,17 @@ DataFrame_join_asof = function( #' c = c(2, 4, 6), #' d = c(7, 8, 9) #' ) -#' df$melt(id_vars = "a", value_vars = c("b", "c", "d")) -DataFrame_melt = function( - id_vars = NULL, - value_vars = NULL, +#' df$unpivot(index = "a", on = c("b", "c", "d")) +DataFrame_unpivot = function( + on = NULL, + ..., + index = NULL, variable_name = NULL, value_name = NULL) { - .pr$DataFrame$melt( - self, id_vars %||% character(), value_vars %||% character(), + .pr$DataFrame$unpivot( + self, on %||% character(), index %||% character(), value_name, variable_name - ) |> unwrap("in $melt( ): ") + ) |> unwrap("in $unpivot( ): ") } diff --git 
a/R/expr__datetime.R b/R/expr__datetime.R index b2e04cfad..5bbbc733d 100644 --- a/R/expr__datetime.R +++ b/R/expr__datetime.R @@ -2,8 +2,8 @@ #' @description Divide the date/datetime range into buckets. #' Each date/datetime is mapped to the start of its bucket. #' -#' @param every string encoding duration see details. -#' @param offset optional string encoding duration see details. +#' @param every Either an Expr or a string indicating a column name or a +#' duration (see Details). #' #' @details The ``every`` and ``offset`` argument are created with the #' the following string language: @@ -20,8 +20,6 @@ #' These strings can be combined: #' - 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds #' @return Date/Datetime expr -#' @keywords ExprDT -#' @aliases (Expr)$dt$truncate #' @examples #' t1 = as.POSIXct("3040-01-01", tz = "GMT") #' t2 = t1 + as.difftime(25, units = "secs") @@ -33,9 +31,9 @@ #' pl$col("datetime")$dt$truncate("4s", offset("3s"))$alias("truncated_4s_offset_2s") #' ) #' df -ExprDT_truncate = function(every, offset = NULL) { - offset = parse_as_polars_duration_string(offset, default = "0ns") - .pr$Expr$dt_truncate(self, every, offset) |> +ExprDT_truncate = function(every) { + every = parse_as_polars_duration_string(every, default = "0ns") + .pr$Expr$dt_truncate(self, every) |> unwrap("in $dt$truncate()") } @@ -46,31 +44,8 @@ ExprDT_truncate = function(every, offset = NULL) { #' Each date/datetime in the second half of the interval #' is mapped to the end of its bucket. #' +#' @inherit ExprDT_truncate params details return #' -#' @param every string encoding duration see details. -#' @param offset optional string encoding duration see details. -#' -#' @details The ``every`` and ``offset`` arguments are created with the -#' following string language: -#' - 1ns # 1 nanosecond -#' - 1us # 1 microsecond -#' - 1ms # 1 millisecond -#' - 1s # 1 second -#' - 1m # 1 minute -#' - 1h # 1 hour -#' - 1d # 1 day -#' - 1w # 1 calendar week -#' - 1mo # 1 calendar month -#' - 1y # 1 calendar year -#' These strings can be combined: -#' - 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds -#' -#' This functionality is currently experimental and may -#' change without it being considered a breaking change. -#' -#' @return Date/Datetime expr -#' @keywords ExprDT -#' @aliases (Expr)$dt$round #' @examples #' t1 = as.POSIXct("3040-01-01", tz = "GMT") #' t2 = t1 + as.difftime(25, units = "secs") @@ -82,10 +57,9 @@ ExprDT_truncate = function(every, offset = NULL) { #' pl$col("datetime")$dt$truncate("4s", offset("3s"))$alias("truncated_4s_offset_2s") #' ) #' df -ExprDT_round = function(every, offset = NULL) { +ExprDT_round = function(every) { every = parse_as_polars_duration_string(every, default = "0ns") - offset = parse_as_polars_duration_string(offset, default = "0ns") - .pr$Expr$dt_round(self, every, offset) |> + .pr$Expr$dt_round(self, every) |> unwrap("in $dt$round()") } diff --git a/R/expr__expr.R b/R/expr__expr.R index 2389b95ba..984dead61 100644 --- a/R/expr__expr.R +++ b/R/expr__expr.R @@ -1406,19 +1406,14 @@ Expr_sort = function(..., descending = FALSE, nulls_last = FALSE) { #' Return the `k` largest elements. This has time complexity: \eqn{ O(n + k #' \\log{}n - \frac{k}{2}) } #' -#' @param k Number of top values to get -#' @param ... Ignored. -#' @param nulls_last Place null values last. -#' @param maintain_order Whether the order should be maintained if elements are -#' equal. -#' @param multithreaded Sort using multiple threads. +#' @param k Number of top values to get. 
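Since `offset` is dropped from both `$dt$truncate()` and `$dt$round()` above, a minimal sketch of the reduced single-argument calls; the datetime values and durations below are illustrative, not taken from the diff:

df = pl$DataFrame(
  datetime = as.POSIXct("2024-01-01 00:00:00", tz = "GMT") + c(0, 25, 50, 75)
)
df$with_columns(
  truncated = pl$col("datetime")$dt$truncate("1m"),
  rounded = pl$col("datetime")$dt$round("1m")
)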
#' #' @return Expr #' @examples #' pl$DataFrame(a = c(6, 1, 0, NA, Inf, NaN))$select(pl$col("a")$top_k(5)) -Expr_top_k = function(k, ..., nulls_last = FALSE, maintain_order = FALSE, multithreaded = TRUE) { +Expr_top_k = function(k) { if (!is.numeric(k) || k < 0) stop("k must be numeric and positive, prefereably integerish") - .pr$Expr$top_k(self, k, nulls_last = nulls_last, maintain_order = maintain_order, multithreaded = multithreaded) |> + .pr$Expr$top_k(self, k) |> unwrap("in $top_k():") } @@ -1430,9 +1425,9 @@ Expr_top_k = function(k, ..., nulls_last = FALSE, maintain_order = FALSE, multit #' @inherit Expr_top_k params return #' @examples #' pl$DataFrame(a = c(6, 1, 0, NA, Inf, NaN))$select(pl$col("a")$bottom_k(5)) -Expr_bottom_k = function(k, ..., nulls_last = FALSE, maintain_order = FALSE, multithreaded = TRUE) { +Expr_bottom_k = function(k) { if (!is.numeric(k) || k < 0) stop("k must be numeric and positive, prefereably integerish") - .pr$Expr$bottom_k(self, k, nulls_last = nulls_last, maintain_order = maintain_order, multithreaded = multithreaded) |> + .pr$Expr$bottom_k(self, k) |> unwrap("in $bottom_k():") } @@ -1844,6 +1839,8 @@ Expr_last = use_extendr_wrapper #' #' @param ... Column(s) to group by. Accepts expression input. #' Characters are parsed as column names. +#' @param order_by Order the window functions/aggregations with the partitioned +#' groups by the result of the expression passed to `order_by`. #' @param mapping_strategy One of the following: #' * `"group_to_rows"` (default): if the aggregation results in multiple values, #' assign them back to their position in the DataFrame. This can only be done @@ -1889,7 +1886,7 @@ Expr_last = use_extendr_wrapper #' df$with_columns( #' top_2 = pl$col("c")$top_k(2)$over("a", mapping_strategy = "join") #' ) -Expr_over = function(..., mapping_strategy = "group_to_rows") { +Expr_over = function(..., order_by = NULL, mapping_strategy = "group_to_rows") { list_of_exprs = list2(...) |> lapply(\(x) { if (is.character(x)) { @@ -1907,7 +1904,7 @@ Expr_over = function(..., mapping_strategy = "group_to_rows") { } }) - .pr$Expr$over(self, list_of_exprs, mapping_strategy) |> + .pr$Expr$over(self, list_of_exprs, order_by, order_by_descending = FALSE, order_by_nulls_last = FALSE, mapping_strategy) |> unwrap("in $over():") } @@ -3307,16 +3304,25 @@ Expr_to_r = function(df = NULL, i = 0, ..., int64_conversion = polars_options()$ #' @param sort Ensure the output is sorted from most values to least. #' @param parallel Better to turn this off in the aggregation context, as it can #' lead to contention. -#' @param name Give the resulting count field a specific name, defaults to -#' `"count"`. -#' @format NULL -#' @examples -#' df = pl$DataFrame(iris)$select(pl$col("Species")$value_counts()) -#' df +#' @param name Give the resulting count column a specific name. The default is +#' `"count"` if `normalize = FALSE` and `"proportion"` if `normalize = TRUE`. +#' @param normalize If `TRUE`, it gives relative frequencies of the unique +#' values instead of their count. 
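The new `order_by` argument documented for `$over()` above could be exercised roughly as follows; the data is hypothetical, and this assumes `order_by` accepts a column name string just like the partition keys (otherwise wrap it in `pl$col()`):

df = pl$DataFrame(
  g = c("a", "a", "a", "b", "b"),
  t = c(3, 1, 2, 2, 1),
  x = c(10, 20, 30, 40, 50)
)
df$with_columns(
  cum = pl$col("x")$cum_sum()$over("g", order_by = "t")
)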
#' -#' df$unnest()$to_data_frame() -Expr_value_counts = function(..., sort = FALSE, parallel = FALSE, name = "count") { - .pr$Expr$value_counts(self, sort, parallel, name) +#' @examples +#' df = pl$DataFrame(iris) +#' df$select(pl$col("Species")$value_counts())$unnest() +#' df$select(pl$col("Species")$value_counts(normalize = TRUE))$unnest() +Expr_value_counts = function(..., sort = FALSE, parallel = FALSE, name, normalize = FALSE) { + if (missing(name)) { + if (isTRUE(normalize)) { + name = "proportion" + } else { + name = "count" + } + } + + .pr$Expr$value_counts(self, sort, parallel, name, normalize) } #' Count unique values diff --git a/R/expr__string.R b/R/expr__string.R index 2d858d9a5..5ff11a3d4 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -236,14 +236,23 @@ ExprStr_len_chars = function() { #' # concatenate a Series of strings to a single string #' df = pl$DataFrame(foo = c(1, NA, 2)) #' -#' df$select(pl$col("foo")$str$concat("-")) +#' df$select(pl$col("foo")$str$join("-")) #' -#' df$select(pl$col("foo")$str$concat("-", ignore_nulls = FALSE)) +#' df$select(pl$col("foo")$str$join("-", ignore_nulls = FALSE)) +ExprStr_join = function( + delimiter = "", + ..., + ignore_nulls = TRUE) { + .pr$Expr$str_join(self, delimiter, ignore_nulls) |> + unwrap("in $join():") +} + ExprStr_concat = function( delimiter = "", ..., ignore_nulls = TRUE) { - .pr$Expr$str_concat(self, delimiter, ignore_nulls) |> + warning("$str$concat() is deprecated as of 0.18.0. Use $str$join() instead.") + .pr$Expr$str_join(self, delimiter, ignore_nulls) |> unwrap("in $concat():") } diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index eeae33d8b..98c5e099e 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -48,9 +48,9 @@ fold <- function(acc, lambda, exprs) .Call(wrap__fold, acc, lambda, exprs) reduce <- function(lambda, exprs) .Call(wrap__reduce, lambda, exprs) -date_range <- function(start, end, interval, closed, time_unit, time_zone) .Call(wrap__date_range, start, end, interval, closed, time_unit, time_zone) +date_range <- function(start, end, interval, closed) .Call(wrap__date_range, start, end, interval, closed) -date_ranges <- function(start, end, interval, closed, time_unit, time_zone) .Call(wrap__date_ranges, start, end, interval, closed, time_unit, time_zone) +date_ranges <- function(start, end, interval, closed) .Call(wrap__date_ranges, start, end, interval, closed) datetime_range <- function(start, end, interval, closed, time_unit, time_zone) .Call(wrap__datetime_range, start, end, interval, closed, time_unit, time_zone) @@ -208,9 +208,9 @@ RPolarsDataFrame$estimated_size <- function() .Call(wrap__RPolarsDataFrame__esti RPolarsDataFrame$null_count <- function() .Call(wrap__RPolarsDataFrame__null_count, self) -RPolarsDataFrame$melt <- function(id_vars, value_vars, value_name, variable_name) .Call(wrap__RPolarsDataFrame__melt, self, id_vars, value_vars, value_name, variable_name) +RPolarsDataFrame$unpivot <- function(on, index, value_name, variable_name, streamable) .Call(wrap__RPolarsDataFrame__unpivot, self, on, index, value_name, variable_name, streamable) -RPolarsDataFrame$pivot_expr <- function(index, columns, values, maintain_order, sort_columns, aggregate_expr, separator) .Call(wrap__RPolarsDataFrame__pivot_expr, self, index, columns, values, maintain_order, sort_columns, aggregate_expr, separator) +RPolarsDataFrame$pivot_expr <- function(on, index, values, maintain_order, sort_columns, aggregate_expr, separator) .Call(wrap__RPolarsDataFrame__pivot_expr, self, on, index, 
values, maintain_order, sort_columns, aggregate_expr, separator) RPolarsDataFrame$sample_n <- function(n, with_replacement, shuffle, seed) .Call(wrap__RPolarsDataFrame__sample_n, self, n, with_replacement, shuffle, seed) @@ -530,9 +530,9 @@ RPolarsExpr$sort_with <- function(descending, nulls_last) .Call(wrap__RPolarsExp RPolarsExpr$arg_sort <- function(descending, nulls_last) .Call(wrap__RPolarsExpr__arg_sort, self, descending, nulls_last) -RPolarsExpr$top_k <- function(k, nulls_last, maintain_order, multithreaded) .Call(wrap__RPolarsExpr__top_k, self, k, nulls_last, maintain_order, multithreaded) +RPolarsExpr$top_k <- function(k) .Call(wrap__RPolarsExpr__top_k, self, k) -RPolarsExpr$bottom_k <- function(k, nulls_last, maintain_order, multithreaded) .Call(wrap__RPolarsExpr__bottom_k, self, k, nulls_last, maintain_order, multithreaded) +RPolarsExpr$bottom_k <- function(k) .Call(wrap__RPolarsExpr__bottom_k, self, k) RPolarsExpr$arg_max <- function() .Call(wrap__RPolarsExpr__arg_max, self) @@ -702,7 +702,7 @@ RPolarsExpr$extend_constant <- function(value, n) .Call(wrap__RPolarsExpr__exten RPolarsExpr$rep <- function(n, rechunk) .Call(wrap__RPolarsExpr__rep, self, n, rechunk) -RPolarsExpr$value_counts <- function(sort, parallel, name) .Call(wrap__RPolarsExpr__value_counts, self, sort, parallel, name) +RPolarsExpr$value_counts <- function(sort, parallel, name, normalize) .Call(wrap__RPolarsExpr__value_counts, self, sort, parallel, name, normalize) RPolarsExpr$unique_counts <- function() .Call(wrap__RPolarsExpr__unique_counts, self) @@ -718,7 +718,9 @@ RPolarsExpr$peak_min <- function() .Call(wrap__RPolarsExpr__peak_min, self) RPolarsExpr$peak_max <- function() .Call(wrap__RPolarsExpr__peak_max, self) -RPolarsExpr$replace <- function(old, new, default, return_dtype) .Call(wrap__RPolarsExpr__replace, self, old, new, default, return_dtype) +RPolarsExpr$replace <- function(old, new) .Call(wrap__RPolarsExpr__replace, self, old, new) + +RPolarsExpr$replace_strict <- function(old, new, default, return_dtype) .Call(wrap__RPolarsExpr__replace_strict, self, old, new, default, return_dtype) RPolarsExpr$rle <- function() .Call(wrap__RPolarsExpr__rle, self) @@ -812,9 +814,9 @@ RPolarsExpr$arr_to_struct <- function(fields) .Call(wrap__RPolarsExpr__arr_to_st RPolarsExpr$arr_shift <- function(n) .Call(wrap__RPolarsExpr__arr_shift, self, n) -RPolarsExpr$dt_truncate <- function(every, offset) .Call(wrap__RPolarsExpr__dt_truncate, self, every, offset) +RPolarsExpr$dt_truncate <- function(every) .Call(wrap__RPolarsExpr__dt_truncate, self, every) -RPolarsExpr$dt_round <- function(every, offset) .Call(wrap__RPolarsExpr__dt_round, self, every, offset) +RPolarsExpr$dt_round <- function(every) .Call(wrap__RPolarsExpr__dt_round, self, every) RPolarsExpr$dt_time <- function() .Call(wrap__RPolarsExpr__dt_time, self) @@ -992,7 +994,7 @@ RPolarsExpr$qcut <- function(probs, labels, left_closed, allow_duplicates, inclu RPolarsExpr$qcut_uniform <- function(n_bins, labels, left_closed, allow_duplicates, include_breaks) .Call(wrap__RPolarsExpr__qcut_uniform, self, n_bins, labels, left_closed, allow_duplicates, include_breaks) -RPolarsExpr$over <- function(partition_by, mapping) .Call(wrap__RPolarsExpr__over, self, partition_by, mapping) +RPolarsExpr$over <- function(partition_by, order_by, order_by_descending, order_by_nulls_last, mapping) .Call(wrap__RPolarsExpr__over, self, partition_by, order_by, order_by_descending, order_by_nulls_last, mapping) RPolarsExpr$print <- function() invisible(.Call(wrap__RPolarsExpr__print, self)) 
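Alongside the wrapper renames here, `$str$concat()` becomes `$str$join()` at the R level (see R/expr__string.R above); a short usage sketch reusing the example data from this patch, with the old spelling now only emitting a deprecation warning before delegating to the new method:

df = pl$DataFrame(foo = c(1, NA, 2))
df$select(pl$col("foo")$str$join("-"))
# deprecated spelling, warns and then calls $str$join():
# df$select(pl$col("foo")$str$concat("-"))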
@@ -1024,7 +1026,7 @@ RPolarsExpr$str_len_bytes <- function() .Call(wrap__RPolarsExpr__str_len_bytes, RPolarsExpr$str_len_chars <- function() .Call(wrap__RPolarsExpr__str_len_chars, self) -RPolarsExpr$str_concat <- function(delimiter, ignore_nulls) .Call(wrap__RPolarsExpr__str_concat, self, delimiter, ignore_nulls) +RPolarsExpr$str_join <- function(delimiter, ignore_nulls) .Call(wrap__RPolarsExpr__str_join, self, delimiter, ignore_nulls) RPolarsExpr$str_to_uppercase <- function() .Call(wrap__RPolarsExpr__str_to_uppercase, self) @@ -1252,7 +1254,7 @@ RPolarsLazyFrame$join <- function(other, left_on, right_on, how, validate, join_ RPolarsLazyFrame$sort_by_exprs <- function(by, dotdotdot, descending, nulls_last, maintain_order, multithreaded) .Call(wrap__RPolarsLazyFrame__sort_by_exprs, self, by, dotdotdot, descending, nulls_last, maintain_order, multithreaded) -RPolarsLazyFrame$melt <- function(id_vars, value_vars, value_name, variable_name, streamable) .Call(wrap__RPolarsLazyFrame__melt, self, id_vars, value_vars, value_name, variable_name, streamable) +RPolarsLazyFrame$unpivot <- function(on, index, value_name, variable_name, streamable) .Call(wrap__RPolarsLazyFrame__unpivot, self, on, index, value_name, variable_name, streamable) RPolarsLazyFrame$rename <- function(existing, new) .Call(wrap__RPolarsLazyFrame__rename, self, existing, new) @@ -1326,7 +1328,7 @@ RPolarsSeries$name <- function() .Call(wrap__RPolarsSeries__name, self) RPolarsSeries$sort <- function(descending, nulls_last, multithreaded) .Call(wrap__RPolarsSeries__sort, self, descending, nulls_last, multithreaded) -RPolarsSeries$value_counts <- function(sort, parallel, name) .Call(wrap__RPolarsSeries__value_counts, self, sort, parallel, name) +RPolarsSeries$value_counts <- function(sort, parallel, name, normalize) .Call(wrap__RPolarsSeries__value_counts, self, sort, parallel, name, normalize) RPolarsSeries$arg_min <- function() .Call(wrap__RPolarsSeries__arg_min, self) diff --git a/R/functions__eager.R b/R/functions__eager.R index fc264e0d5..3f107f402 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -181,12 +181,6 @@ pl_concat = function( #' @param ... Ignored. #' @param closed Define which sides of the range are closed (inclusive). #' One of the followings: `"both"` (default), `"left"`, `"right"`, `"none"`. -#' @param time_unit Time unit of the resulting the [Datetime][DataType_Datetime] -#' data type. One of `"ns"`, `"us"`, `"ms"` or `NULL`. Only takes effect if the -#' output column is of type [Datetime][DataType_Datetime] (deprecated usage). -#' @param time_zone Time zone of the resulting [Datetime][DataType_Datetime] data -#' type. Only takes effect if the output column is of type [Datetime][DataType_Datetime] -#' (deprecated usage). 
#' @return An [Expr][Expr_class] of data type Date or [Datetime][DataType_Datetime] #' #' @inheritSection polars_duration_string Polars duration string language @@ -211,46 +205,12 @@ pl_date_range = function( end, interval = "1d", ..., - closed = "both", - time_unit = NULL, - time_zone = NULL) { - .warn_for_deprecated_date_range_use(start, end, interval, time_unit, time_zone) - + closed = "both") { interval = parse_as_polars_duration_string(interval) date_range(start, end, interval, closed, time_unit, time_zone) |> unwrap("in pl$date_range():") } - -.warn_for_deprecated_date_range_use = function( - start, - end, - interval, - time_unit = NULL, - time_zone = NULL) { - if ( - inherits(start, "POSIXt") || - inherits(end, "POSIXt") || - !is.null(time_unit) || - !is.null(time_zone) || - ( - is.character(interval) && - length(interval) == 1L && - (grepl("h", interval) || grepl("m", gsub("mo", "", interval)) || grepl("s", gsub("saturating", "", interval))) - ) - ) { - warning( - "Creating Datetime ranges using `pl$date_range()` is deprecated.", - "Use `pl$datetime_range()` instead.", - call. = FALSE - ) - } - - invisible(NULL) -} - - - # TODO: link to the Date type docs #' Generate a list containing a date range #' @@ -289,11 +249,7 @@ pl_date_ranges = function( end, interval = "1d", ..., - closed = "both", - time_unit = NULL, - time_zone = NULL) { - .warn_for_deprecated_date_range_use(start, end, interval, time_unit, time_zone) - + closed = "both") { interval = parse_as_polars_duration_string(interval) date_ranges(start, end, interval, closed, time_unit, time_zone) |> unwrap("in pl$date_ranges():") diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 4a5cf474b..02eba733d 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1489,14 +1489,14 @@ LazyFrame_join_asof = function( #' Unpivot a Frame from wide to long format #' -#' @param id_vars Columns to use as identifier variables. -#' @param value_vars Values to use as identifier variables. If `value_vars` is +#' @param on Values to use as identifier variables. If `value_vars` is #' empty all columns that are not in `id_vars` will be used. +#' @param ... Not used. +#' @param index Columns to use as identifier variables. #' @param variable_name Name to give to the new column containing the names of #' the melted columns. Defaults to "variable". #' @param value_name Name to give to the new column containing the values of #' the melted columns. Defaults to "value" -#' @param ... Not used. #' @param streamable Allow this node to run in the streaming engine. If this #' runs in streaming, the output of the melt operation will not have a stable #' ordering. 
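With `time_unit` and `time_zone` removed, `pl$date_range()` and `pl$date_ranges()` are now Date-only, as sketched below; the dates are illustrative, and Datetime ranges are assumed to go through `pl$datetime_range()` as the removed deprecation warning suggested:

pl$date_range(as.Date("2024-01-01"), as.Date("2024-01-10"), interval = "2d", closed = "both")
pl$date_ranges(as.Date("2024-01-01"), as.Date("2024-01-05"), interval = "1d")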
@@ -1519,18 +1519,18 @@ LazyFrame_join_asof = function( #' b = c(1, 3, 5), #' c = c(2, 4, 6) #' ) -#' lf$melt(id_vars = "a", value_vars = c("b", "c"))$collect() -LazyFrame_melt = function( - id_vars = NULL, - value_vars = NULL, +#' lf$unpivot(index = "a", on = c("b", "c"))$collect() +LazyFrame_unpivot = function( + on = NULL, + ..., + index = NULL, variable_name = NULL, value_name = NULL, - ..., streamable = TRUE) { - .pr$LazyFrame$melt( - self, id_vars %||% character(), value_vars %||% character(), + .pr$LazyFrame$unpivot( + self, on %||% character(), index %||% character(), value_name, variable_name, streamable - ) |> unwrap("in $melt( ): ") + ) |> unwrap("in $unpivot( ): ") } #' Rename column names of a LazyFrame diff --git a/man/DataFrame_melt.Rd b/man/DataFrame_unpivot.Rd similarity index 77% rename from man/DataFrame_melt.Rd rename to man/DataFrame_unpivot.Rd index bf3098504..34e0f813c 100644 --- a/man/DataFrame_melt.Rd +++ b/man/DataFrame_unpivot.Rd @@ -1,22 +1,25 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/dataframe__frame.R -\name{DataFrame_melt} -\alias{DataFrame_melt} +\name{DataFrame_unpivot} +\alias{DataFrame_unpivot} \title{Unpivot a Frame from wide to long format} \usage{ -DataFrame_melt( - id_vars = NULL, - value_vars = NULL, +DataFrame_unpivot( + on = NULL, + ..., + index = NULL, variable_name = NULL, value_name = NULL ) } \arguments{ -\item{id_vars}{Columns to use as identifier variables.} - -\item{value_vars}{Values to use as identifier variables. If \code{value_vars} is +\item{on}{Values to use as identifier variables. If \code{value_vars} is empty all columns that are not in \code{id_vars} will be used.} +\item{...}{Not used.} + +\item{index}{Columns to use as identifier variables.} + \item{variable_name}{Name to give to the new column containing the names of the melted columns. Defaults to "variable".} @@ -44,6 +47,6 @@ df = pl$DataFrame( c = c(2, 4, 6), d = c(7, 8, 9) ) -df$melt(id_vars = "a", value_vars = c("b", "c", "d")) +df$unpivot(index = "a", on = c("b", "c", "d")) } \keyword{DataFrame} diff --git a/man/ExprDT_round.Rd b/man/ExprDT_round.Rd index 196acdc82..be51b35af 100644 --- a/man/ExprDT_round.Rd +++ b/man/ExprDT_round.Rd @@ -2,15 +2,13 @@ % Please edit documentation in R/expr__datetime.R \name{ExprDT_round} \alias{ExprDT_round} -\alias{(Expr)$dt$round} \title{Round datetime} \usage{ -ExprDT_round(every, offset = NULL) +ExprDT_round(every) } \arguments{ -\item{every}{string encoding duration see details.} - -\item{offset}{optional string encoding duration see details.} +\item{every}{Either an Expr or a string indicating a column name or a +duration (see Details).} } \value{ Date/Datetime expr @@ -23,8 +21,8 @@ Each date/datetime in the second half of the interval is mapped to the end of its bucket. } \details{ -The \code{every} and \code{offset} arguments are created with the -following string language: +The \code{every} and \code{offset} argument are created with the +the following string language: \itemize{ \item 1ns # 1 nanosecond \item 1us # 1 microsecond @@ -41,9 +39,6 @@ These strings can be combined: \item 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds } } - -This functionality is currently experimental and may -change without it being considered a breaking change. 
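For the `melt()` to `unpivot()` rename running through these files, a brief before/after sketch using the same toy data as the examples; `id_vars` maps to `index` and `value_vars` maps to `on`:

lf = pl$LazyFrame(a = c("x", "y", "z"), b = c(1, 3, 5), c = c(2, 4, 6))
# previously: lf$melt(id_vars = "a", value_vars = c("b", "c"))$collect()
lf$unpivot(index = "a", on = c("b", "c"))$collect()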
} \examples{ t1 = as.POSIXct("3040-01-01", tz = "GMT") @@ -57,4 +52,3 @@ df = pl$DataFrame(datetime = s)$with_columns( ) df } -\keyword{ExprDT} diff --git a/man/ExprDT_truncate.Rd b/man/ExprDT_truncate.Rd index 273c7efff..e177bd962 100644 --- a/man/ExprDT_truncate.Rd +++ b/man/ExprDT_truncate.Rd @@ -2,15 +2,13 @@ % Please edit documentation in R/expr__datetime.R \name{ExprDT_truncate} \alias{ExprDT_truncate} -\alias{(Expr)$dt$truncate} \title{Truncate datetime} \usage{ -ExprDT_truncate(every, offset = NULL) +ExprDT_truncate(every) } \arguments{ -\item{every}{string encoding duration see details.} - -\item{offset}{optional string encoding duration see details.} +\item{every}{Either an Expr or a string indicating a column name or a +duration (see Details).} } \value{ Date/Datetime expr @@ -51,4 +49,3 @@ df = pl$DataFrame(datetime = s)$with_columns( ) df } -\keyword{ExprDT} diff --git a/man/ExprStr_concat.Rd b/man/ExprStr_join.Rd similarity index 77% rename from man/ExprStr_concat.Rd rename to man/ExprStr_join.Rd index 1079dcd08..1c533d390 100644 --- a/man/ExprStr_concat.Rd +++ b/man/ExprStr_join.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/expr__string.R -\name{ExprStr_concat} -\alias{ExprStr_concat} +\name{ExprStr_join} +\alias{ExprStr_join} \title{Vertically concatenate the string values in the column to a single string value.} \usage{ -ExprStr_concat(delimiter = "", ..., ignore_nulls = TRUE) +ExprStr_join(delimiter = "", ..., ignore_nulls = TRUE) } \arguments{ \item{delimiter}{The delimiter to insert between consecutive string values.} @@ -24,7 +24,7 @@ Vertically concatenate the string values in the column to a single string value. # concatenate a Series of strings to a single string df = pl$DataFrame(foo = c(1, NA, 2)) -df$select(pl$col("foo")$str$concat("-")) +df$select(pl$col("foo")$str$join("-")) -df$select(pl$col("foo")$str$concat("-", ignore_nulls = FALSE)) +df$select(pl$col("foo")$str$join("-", ignore_nulls = FALSE)) } diff --git a/man/Expr_bottom_k.Rd b/man/Expr_bottom_k.Rd index 614baf8b9..16b803bf9 100644 --- a/man/Expr_bottom_k.Rd +++ b/man/Expr_bottom_k.Rd @@ -4,25 +4,10 @@ \alias{Expr_bottom_k} \title{Bottom k values} \usage{ -Expr_bottom_k( - k, - ..., - nulls_last = FALSE, - maintain_order = FALSE, - multithreaded = TRUE -) +Expr_bottom_k(k) } \arguments{ -\item{k}{Number of top values to get} - -\item{...}{Ignored.} - -\item{nulls_last}{Place null values last.} - -\item{maintain_order}{Whether the order should be maintained if elements are -equal.} - -\item{multithreaded}{Sort using multiple threads.} +\item{k}{Number of top values to get.} } \value{ Expr diff --git a/man/Expr_over.Rd b/man/Expr_over.Rd index 84bcc9256..4559f34e7 100644 --- a/man/Expr_over.Rd +++ b/man/Expr_over.Rd @@ -4,12 +4,15 @@ \alias{Expr_over} \title{Compute expressions over the given groups} \usage{ -Expr_over(..., mapping_strategy = "group_to_rows") +Expr_over(..., order_by = NULL, mapping_strategy = "group_to_rows") } \arguments{ \item{...}{Column(s) to group by. Accepts expression input. 
Characters are parsed as column names.} +\item{order_by}{Order the window functions/aggregations with the partitioned +groups by the result of the expression passed to \code{order_by}.} + \item{mapping_strategy}{One of the following: \itemize{ \item \code{"group_to_rows"} (default): if the aggregation results in multiple values, diff --git a/man/Expr_top_k.Rd b/man/Expr_top_k.Rd index 7601ba956..8ab61738d 100644 --- a/man/Expr_top_k.Rd +++ b/man/Expr_top_k.Rd @@ -4,25 +4,10 @@ \alias{Expr_top_k} \title{Top k values} \usage{ -Expr_top_k( - k, - ..., - nulls_last = FALSE, - maintain_order = FALSE, - multithreaded = TRUE -) +Expr_top_k(k) } \arguments{ -\item{k}{Number of top values to get} - -\item{...}{Ignored.} - -\item{nulls_last}{Place null values last.} - -\item{maintain_order}{Whether the order should be maintained if elements are -equal.} - -\item{multithreaded}{Sort using multiple threads.} +\item{k}{Number of top values to get.} } \value{ Expr diff --git a/man/Expr_value_counts.Rd b/man/Expr_value_counts.Rd index 7bbab1c40..2e3d7f7f9 100644 --- a/man/Expr_value_counts.Rd +++ b/man/Expr_value_counts.Rd @@ -4,7 +4,7 @@ \alias{Expr_value_counts} \title{Value counts} \usage{ -Expr_value_counts(..., sort = FALSE, parallel = FALSE, name = "count") +Expr_value_counts(..., sort = FALSE, parallel = FALSE, name, normalize = FALSE) } \arguments{ \item{...}{Ignored.} @@ -14,8 +14,11 @@ Expr_value_counts(..., sort = FALSE, parallel = FALSE, name = "count") \item{parallel}{Better to turn this off in the aggregation context, as it can lead to contention.} -\item{name}{Give the resulting count field a specific name, defaults to -\code{"count"}.} +\item{name}{Give the resulting count column a specific name. The default is +\code{"count"} if \code{normalize = FALSE} and \code{"proportion"} if \code{normalize = TRUE}.} + +\item{normalize}{If \code{TRUE}, it gives relative frequencies of the unique +values instead of their count.} } \value{ Expr @@ -24,8 +27,7 @@ Expr Count all unique values and create a struct mapping value to count. } \examples{ -df = pl$DataFrame(iris)$select(pl$col("Species")$value_counts()) -df - -df$unnest()$to_data_frame() +df = pl$DataFrame(iris) +df$select(pl$col("Species")$value_counts())$unnest() +df$select(pl$col("Species")$value_counts(normalize = TRUE))$unnest() } diff --git a/man/LazyFrame_melt.Rd b/man/LazyFrame_unpivot.Rd similarity index 80% rename from man/LazyFrame_melt.Rd rename to man/LazyFrame_unpivot.Rd index 057b75fdf..09fdd57cd 100644 --- a/man/LazyFrame_melt.Rd +++ b/man/LazyFrame_unpivot.Rd @@ -1,32 +1,32 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/lazyframe__lazy.R -\name{LazyFrame_melt} -\alias{LazyFrame_melt} +\name{LazyFrame_unpivot} +\alias{LazyFrame_unpivot} \title{Unpivot a Frame from wide to long format} \usage{ -LazyFrame_melt( - id_vars = NULL, - value_vars = NULL, +LazyFrame_unpivot( + on = NULL, + ..., + index = NULL, variable_name = NULL, value_name = NULL, - ..., streamable = TRUE ) } \arguments{ -\item{id_vars}{Columns to use as identifier variables.} - -\item{value_vars}{Values to use as identifier variables. If \code{value_vars} is +\item{on}{Values to use as identifier variables. If \code{value_vars} is empty all columns that are not in \code{id_vars} will be used.} +\item{...}{Not used.} + +\item{index}{Columns to use as identifier variables.} + \item{variable_name}{Name to give to the new column containing the names of the melted columns. 
Defaults to "variable".} \item{value_name}{Name to give to the new column containing the values of the melted columns. Defaults to "value"} -\item{...}{Not used.} - \item{streamable}{Allow this node to run in the streaming engine. If this runs in streaming, the output of the melt operation will not have a stable ordering.} @@ -51,6 +51,6 @@ lf = pl$LazyFrame( b = c(1, 3, 5), c = c(2, 4, 6) ) -lf$melt(id_vars = "a", value_vars = c("b", "c"))$collect() +lf$unpivot(index = "a", on = c("b", "c"))$collect() } \keyword{LazyFrame} diff --git a/man/Series_value_counts.Rd b/man/Series_value_counts.Rd index 88725a6d8..cd0423392 100644 --- a/man/Series_value_counts.Rd +++ b/man/Series_value_counts.Rd @@ -14,8 +14,8 @@ Series_value_counts(..., sort = TRUE, parallel = FALSE, name = "count") \item{parallel}{Better to turn this off in the aggregation context, as it can lead to contention.} -\item{name}{Give the resulting count field a specific name, defaults to -\code{"count"}.} +\item{name}{Give the resulting count column a specific name. The default is +\code{"count"} if \code{normalize = FALSE} and \code{"proportion"} if \code{normalize = TRUE}.} } \value{ DataFrame diff --git a/man/pl_date_range.Rd b/man/pl_date_range.Rd index 5fda132b7..08d6377c5 100644 --- a/man/pl_date_range.Rd +++ b/man/pl_date_range.Rd @@ -4,15 +4,7 @@ \alias{pl_date_range} \title{Generate a date range} \usage{ -pl_date_range( - start, - end, - interval = "1d", - ..., - closed = "both", - time_unit = NULL, - time_zone = NULL -) +pl_date_range(start, end, interval = "1d", ..., closed = "both") } \arguments{ \item{start}{Lower bound of the date range. Something that can be coerced to @@ -29,14 +21,6 @@ See the \verb{Polars duration string language} section for details.} \item{closed}{Define which sides of the range are closed (inclusive). One of the followings: \code{"both"} (default), \code{"left"}, \code{"right"}, \code{"none"}.} - -\item{time_unit}{Time unit of the resulting the \link[=DataType_Datetime]{Datetime} -data type. One of \code{"ns"}, \code{"us"}, \code{"ms"} or \code{NULL}. Only takes effect if the -output column is of type \link[=DataType_Datetime]{Datetime} (deprecated usage).} - -\item{time_zone}{Time zone of the resulting \link[=DataType_Datetime]{Datetime} data -type. Only takes effect if the output column is of type \link[=DataType_Datetime]{Datetime} -(deprecated usage).} } \value{ An \link[=Expr_class]{Expr} of data type Date or \link[=DataType_Datetime]{Datetime} diff --git a/man/pl_date_ranges.Rd b/man/pl_date_ranges.Rd index 8c31e3365..4c336fc3e 100644 --- a/man/pl_date_ranges.Rd +++ b/man/pl_date_ranges.Rd @@ -4,15 +4,7 @@ \alias{pl_date_ranges} \title{Generate a list containing a date range} \usage{ -pl_date_ranges( - start, - end, - interval = "1d", - ..., - closed = "both", - time_unit = NULL, - time_zone = NULL -) +pl_date_ranges(start, end, interval = "1d", ..., closed = "both") } \arguments{ \item{start}{Lower bound of the date range. Something that can be coerced to @@ -29,14 +21,6 @@ See the \verb{Polars duration string language} section for details.} \item{closed}{Define which sides of the range are closed (inclusive). One of the followings: \code{"both"} (default), \code{"left"}, \code{"right"}, \code{"none"}.} - -\item{time_unit}{Time unit of the resulting the \link[=DataType_Datetime]{Datetime} -data type. One of \code{"ns"}, \code{"us"}, \code{"ms"} or \code{NULL}. 
Only takes effect if the -output column is of type \link[=DataType_Datetime]{Datetime} (deprecated usage).} - -\item{time_zone}{Time zone of the resulting \link[=DataType_Datetime]{Datetime} data -type. Only takes effect if the output column is of type \link[=DataType_Datetime]{Datetime} -(deprecated usage).} } \value{ An \link[=Expr_class]{Expr} of data type List(Date) or diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index 08a79d57f..6771d836f 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -110,19 +110,19 @@ [11] "export_stream" "from_arrow_record_batches" [13] "from_raw_ipc" "get_column" [15] "get_columns" "lazy" - [17] "melt" "n_chunks" - [19] "new_with_capacity" "null_count" - [21] "partition_by" "pivot_expr" - [23] "print" "rechunk" - [25] "sample_frac" "sample_n" - [27] "schema" "select" - [29] "select_at_idx" "select_seq" - [31] "set_column_from_robj" "set_column_from_series" - [33] "set_column_names_mut" "shape" - [35] "to_list" "to_list_tag_structs" - [37] "to_list_unwind" "to_raw_ipc" - [39] "to_struct" "transpose" - [41] "unnest" "with_columns" + [17] "n_chunks" "new_with_capacity" + [19] "null_count" "partition_by" + [21] "pivot_expr" "print" + [23] "rechunk" "sample_frac" + [25] "sample_n" "schema" + [27] "select" "select_at_idx" + [29] "select_seq" "set_column_from_robj" + [31] "set_column_from_series" "set_column_names_mut" + [33] "shape" "to_list" + [35] "to_list_tag_structs" "to_list_unwind" + [37] "to_raw_ipc" "to_struct" + [39] "transpose" "unnest" + [41] "unpivot" "with_columns" [43] "with_columns_seq" "with_row_index" [45] "write_csv" "write_ipc" [47] "write_json" "write_ndjson" @@ -192,20 +192,20 @@ [19] "join" "join_asof" [21] "last" "max" [23] "mean" "median" - [25] "melt" "min" - [27] "print" "profile" - [29] "quantile" "rename" - [31] "reverse" "rolling" - [33] "schema" "select" - [35] "select_seq" "serialize" - [37] "set_optimization_toggle" "shift" - [39] "shift_and_fill" "sink_csv" - [41] "sink_ipc" "sink_json" - [43] "sink_parquet" "slice" - [45] "sort_by_exprs" "std" - [47] "sum" "tail" - [49] "to_dot" "unique" - [51] "unnest" "var" + [25] "min" "print" + [27] "profile" "quantile" + [29] "rename" "reverse" + [31] "rolling" "schema" + [33] "select" "select_seq" + [35] "serialize" "set_optimization_toggle" + [37] "shift" "shift_and_fill" + [39] "sink_csv" "sink_ipc" + [41] "sink_json" "sink_parquet" + [43] "slice" "sort_by_exprs" + [45] "std" "sum" + [47] "tail" "to_dot" + [49] "unique" "unnest" + [51] "unpivot" "var" [53] "with_columns" "with_columns_seq" [55] "with_context" "with_row_index" @@ -399,56 +399,56 @@ [229] "rank" "rechunk" [231] "reinterpret" "rem" [233] "rep" "repeat_by" - [235] "replace" "reshape" - [237] "reverse" "rle" - [239] "rle_id" "rolling" - [241] "rolling_corr" "rolling_cov" - [243] "rolling_max" "rolling_max_by" - [245] "rolling_mean" "rolling_mean_by" - [247] "rolling_median" "rolling_median_by" - [249] "rolling_min" "rolling_min_by" - [251] "rolling_quantile" "rolling_quantile_by" - [253] "rolling_skew" "rolling_std" - [255] "rolling_std_by" "rolling_sum" - [257] "rolling_sum_by" "rolling_var" - [259] "rolling_var_by" "round" - [261] "sample_frac" "sample_n" - [263] "search_sorted" "shift" - [265] "shift_and_fill" "shrink_dtype" - [267] "shuffle" "sign" - [269] "sin" "sinh" - [271] "skew" "slice" - [273] "sort_by" "sort_with" - [275] "std" "str_base64_decode" - [277] "str_base64_encode" "str_concat" + [235] "replace" 
"replace_strict" + [237] "reshape" "reverse" + [239] "rle" "rle_id" + [241] "rolling" "rolling_corr" + [243] "rolling_cov" "rolling_max" + [245] "rolling_max_by" "rolling_mean" + [247] "rolling_mean_by" "rolling_median" + [249] "rolling_median_by" "rolling_min" + [251] "rolling_min_by" "rolling_quantile" + [253] "rolling_quantile_by" "rolling_skew" + [255] "rolling_std" "rolling_std_by" + [257] "rolling_sum" "rolling_sum_by" + [259] "rolling_var" "rolling_var_by" + [261] "round" "sample_frac" + [263] "sample_n" "search_sorted" + [265] "shift" "shift_and_fill" + [267] "shrink_dtype" "shuffle" + [269] "sign" "sin" + [271] "sinh" "skew" + [273] "slice" "sort_by" + [275] "sort_with" "std" + [277] "str_base64_decode" "str_base64_encode" [279] "str_contains" "str_contains_any" [281] "str_count_matches" "str_ends_with" [283] "str_extract" "str_extract_all" [285] "str_extract_groups" "str_find" [287] "str_head" "str_hex_decode" - [289] "str_hex_encode" "str_json_decode" - [291] "str_json_path_match" "str_len_bytes" - [293] "str_len_chars" "str_pad_end" - [295] "str_pad_start" "str_replace" - [297] "str_replace_all" "str_replace_many" - [299] "str_reverse" "str_slice" - [301] "str_split" "str_split_exact" - [303] "str_splitn" "str_starts_with" - [305] "str_strip_chars" "str_strip_chars_end" - [307] "str_strip_chars_start" "str_tail" - [309] "str_to_date" "str_to_datetime" - [311] "str_to_integer" "str_to_lowercase" - [313] "str_to_time" "str_to_titlecase" - [315] "str_to_uppercase" "str_zfill" - [317] "struct_field_by_name" "struct_rename_fields" - [319] "struct_with_fields" "sub" - [321] "sum" "tail" - [323] "tan" "tanh" - [325] "to_physical" "top_k" - [327] "unique" "unique_counts" - [329] "unique_stable" "upper_bound" - [331] "value_counts" "var" - [333] "xor" + [289] "str_hex_encode" "str_join" + [291] "str_json_decode" "str_json_path_match" + [293] "str_len_bytes" "str_len_chars" + [295] "str_pad_end" "str_pad_start" + [297] "str_replace" "str_replace_all" + [299] "str_replace_many" "str_reverse" + [301] "str_slice" "str_split" + [303] "str_split_exact" "str_splitn" + [305] "str_starts_with" "str_strip_chars" + [307] "str_strip_chars_end" "str_strip_chars_start" + [309] "str_tail" "str_to_date" + [311] "str_to_datetime" "str_to_integer" + [313] "str_to_lowercase" "str_to_time" + [315] "str_to_titlecase" "str_to_uppercase" + [317] "str_zfill" "struct_field_by_name" + [319] "struct_rename_fields" "struct_with_fields" + [321] "sub" "sum" + [323] "tail" "tan" + [325] "tanh" "to_physical" + [327] "top_k" "unique" + [329] "unique_counts" "unique_stable" + [331] "upper_bound" "value_counts" + [333] "var" "xor" # public and private methods of each class When diff --git a/tests/testthat/test-expr_expr.R b/tests/testthat/test-expr_expr.R index 7ccc2026e..10f938f86 100644 --- a/tests/testthat/test-expr_expr.R +++ b/tests/testthat/test-expr_expr.R @@ -2381,6 +2381,18 @@ test_that("$value_counts", { count = rep(50, 3) ) ) + + # arg "normalize" + expect_equal( + df$select(pl$col("Species")$value_counts(normalize = TRUE))$ + unnest()$ + sort("Species")$ + to_data_frame(), + data.frame( + Species = factor(c("setosa", "versicolor", "virginica")), + proportion = rep(0.33333333, 3) + ) + ) }) From 6ecab19fd94d6f8446c2207cf366511e04810460 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 16:16:38 +0200 Subject: [PATCH 21/47] more fixes [skip ci] --- R/dataframe__frame.R | 25 ++++++------- R/expr__string.R | 4 +-- R/lazyframe__lazy.R | 14 ++++++-- R/utils.R | 29 +++++++++++++++ 
tests/testthat/_snaps/after-wrappers.md | 48 ++++++++++++------------- tests/testthat/_snaps/lazy.md | 6 ++-- tests/testthat/test-dataframe.R | 36 +++++++++---------- 7 files changed, 100 insertions(+), 62 deletions(-) diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index e90b8f089..e204b1c86 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1519,9 +1519,9 @@ DataFrame_unpivot = function( #' Pivot data from long to wide #' @param values Column values to aggregate. Can be multiple columns if the -#' `columns` arguments contains multiple columns as well. +#' `on` arguments contains multiple columns as well. #' @param index One or multiple keys to group by. -#' @param columns Name of the column(s) whose values will be used as the header +#' @param on Name of the column(s) whose values will be used as the header #' of the output DataFrame. #' @param ... Not used. #' @param aggregate_function One of: @@ -1545,7 +1545,7 @@ DataFrame_unpivot = function( #' df #' #' df$pivot( -#' values = "baz", index = "foo", columns = "bar" +#' values = "baz", index = "foo", on = "bar" #' ) #' #' # Run an expression as aggregation function @@ -1558,15 +1558,15 @@ DataFrame_unpivot = function( #' #' df$pivot( #' index = "col1", -#' columns = "col2", +#' on = "col2", #' values = "col3", #' aggregate_function = pl$element()$tanh()$mean() #' ) DataFrame_pivot = function( - values, - index, - columns, + on, ..., + index, + values, aggregate_function = NULL, maintain_order = TRUE, sort_columns = FALSE, @@ -1587,7 +1587,7 @@ DataFrame_pivot = function( )) |> # run pivot when valid aggregate_expr and_then(\(aggregate_expr) .pr$DataFrame$pivot_expr( - self, index, columns, values, maintain_order, sort_columns, aggregate_expr, separator + self, on, index, values, maintain_order, sort_columns, aggregate_expr, separator )) |> # unwrap and add method context name unwrap("in $pivot():") @@ -1737,7 +1737,7 @@ DataFrame_describe = function(percentiles = c(.25, .75), interpolation = "neares )$ unnest("fields")$ drop("column")$ - pivot(index = "statistic", columns = "variable", values = "column_0")$ + pivot(index = "statistic", on = "variable", values = "column_0")$ with_columns(statistic = pl$lit(metrics)) }) |> uw() @@ -1763,12 +1763,12 @@ DataFrame_glimpse = function(..., return_as_string = FALSE) { } # closure to extract col info from a column in - max_num_value = min(10, self$height) + max_num_value = as.integer(min(10, self$height)) max_col_name_trunc = 50 parse_column_ = \(col_name, dtype) { dtype_str = dtype_str_repr(dtype) |> unwrap_or(paste0("??", str_string(dtype))) if (inherits(dtype, "RPolarsDataType")) dtype_str = paste0(" <", dtype_str, ">") - val = self$select(pl$col(col_name)$slice(0, max_num_value))$to_list()[[1]] + val = self$select(pl$col(col_name)$slice(0L, max_num_value))$to_list()[[1]] val_str = paste(val, collapse = ", ") if (nchar(col_name) > max_col_name_trunc) { col_name = paste0(substr(col_name, 1, max_col_name_trunc - 3), "...") @@ -2032,9 +2032,10 @@ DataFrame_write_parquet = function( ..., compression = "zstd", compression_level = 3, - statistics = FALSE, + statistics = TRUE, row_group_size = NULL, data_pagesize_limit = NULL) { + statistics = translate_statistics(statistics) .pr$DataFrame$write_parquet( self, file, diff --git a/R/expr__string.R b/R/expr__string.R index 5ff11a3d4..1cb764373 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -244,7 +244,7 @@ ExprStr_join = function( ..., ignore_nulls = TRUE) { .pr$Expr$str_join(self, delimiter, ignore_nulls) |> - 
unwrap("in $join():") + unwrap("in $str$join():") } ExprStr_concat = function( @@ -253,7 +253,7 @@ ExprStr_concat = function( ignore_nulls = TRUE) { warning("$str$concat() is deprecated as of 0.18.0. Use $str$join() instead.") .pr$Expr$str_join(self, delimiter, ignore_nulls) |> - unwrap("in $concat():") + unwrap("in $str$concat():") } #' Convert a string to uppercase diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 02eba733d..fef12c014 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -629,8 +629,14 @@ LazyFrame_collect_in_background = function() { #' * "gzip": min-level: 0, max-level: 10. #' * "brotli": min-level: 0, max-level: 11. #' * "zstd": min-level: 1, max-level: 22. -#' @param statistics Logical. Whether compute and write column statistics. -#' This requires extra compute. +#' @param statistics Whether statistics should be written to the Parquet +#' headers. Possible values: +#' * `TRUE`: enable default set of statistics (default) +#' * `FALSE`: disable all statistics +#' * `"full"`: calculate and write all available statistics. +#' * A named list where all values must be `TRUE` or `FALSE`, e.g. +#' `list(min = TRUE, max = FALSE)`. Statistics available are `"min"`, `"max"`, +#' `"distinct_count"`, `"null_count"`. #' @param row_group_size `NULL` or Integer. Size of the row groups in number of #' rows. If `NULL` (default), the chunks of the DataFrame are used. Writing in #' smaller chunks may reduce memory pressure and improve writing speeds. @@ -661,7 +667,7 @@ LazyFrame_sink_parquet = function( ..., compression = "zstd", compression_level = 3, - statistics = FALSE, + statistics = TRUE, row_group_size = NULL, data_pagesize_limit = NULL, maintain_order = TRUE, @@ -693,6 +699,8 @@ LazyFrame_sink_parquet = function( ) |> unwrap("in $sink_parquet()") } + statistics = translate_statistics(statistics) + lf |> .pr$LazyFrame$sink_parquet( path, diff --git a/R/utils.R b/R/utils.R index 45c246fb3..296ec69cb 100644 --- a/R/utils.R +++ b/R/utils.R @@ -678,3 +678,32 @@ is_named = function(x) { } TRUE } + +# Used in parquet write/sink +translate_statistics = function(statistics) { + if (is.logical(statistics)) { + if (isTRUE(statistics)) { + statistics = list( + min = TRUE, + max = TRUE, + distinct_count = FALSE, + null_count = TRUE + ) + } else { + statistics = list( + min = FALSE, + max = FALSE, + distinct_count = FALSE, + null_count = FALSE + ) + } + } else if (is.character(statistics) && statistics == "full") { + statistics = list( + min = TRUE, + max = TRUE, + distinct_count = TRUE, + null_count = TRUE + ) + } + statistics +} diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index 6771d836f..be3569a32 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -85,15 +85,15 @@ [21] "group_by" "group_by_dynamic" "head" "height" [25] "item" "join" "join_asof" "last" [29] "lazy" "limit" "max" "mean" - [33] "median" "melt" "min" "n_chunks" - [37] "null_count" "partition_by" "pivot" "print" - [41] "quantile" "rechunk" "rename" "reverse" - [45] "rolling" "sample" "schema" "select" - [49] "select_seq" "shape" "shift" "shift_and_fill" - [53] "slice" "sort" "sql" "std" - [57] "sum" "tail" "to_data_frame" "to_list" - [61] "to_raw_ipc" "to_series" "to_struct" "transpose" - [65] "unique" "unnest" "var" "width" + [33] "median" "min" "n_chunks" "null_count" + [37] "partition_by" "pivot" "print" "quantile" + [41] "rechunk" "rename" "reverse" "rolling" + [45] "sample" "schema" "select" "select_seq" 
+ [49] "shape" "shift" "shift_and_fill" "slice" + [53] "sort" "sql" "std" "sum" + [57] "tail" "to_data_frame" "to_list" "to_raw_ipc" + [61] "to_series" "to_struct" "transpose" "unique" + [65] "unnest" "unpivot" "var" "width" [69] "with_columns" "with_columns_seq" "with_row_index" "write_csv" [73] "write_ipc" "write_json" "write_ndjson" "write_parquet" @@ -156,21 +156,21 @@ [21] "join" "join_asof" [23] "last" "limit" [25] "max" "mean" - [27] "median" "melt" - [29] "min" "print" - [31] "profile" "quantile" - [33] "rename" "reverse" - [35] "rolling" "schema" - [37] "select" "select_seq" - [39] "serialize" "set_optimization_toggle" - [41] "shift" "shift_and_fill" - [43] "sink_csv" "sink_ipc" - [45] "sink_ndjson" "sink_parquet" - [47] "slice" "sort" - [49] "sql" "std" - [51] "sum" "tail" - [53] "to_dot" "unique" - [55] "unnest" "var" + [27] "median" "min" + [29] "print" "profile" + [31] "quantile" "rename" + [33] "reverse" "rolling" + [35] "schema" "select" + [37] "select_seq" "serialize" + [39] "set_optimization_toggle" "shift" + [41] "shift_and_fill" "sink_csv" + [43] "sink_ipc" "sink_ndjson" + [45] "sink_parquet" "slice" + [47] "sort" "sql" + [49] "std" "sum" + [51] "tail" "to_dot" + [53] "unique" "unnest" + [55] "unpivot" "var" [57] "width" "with_columns" [59] "with_columns_seq" "with_context" [61] "with_row_index" diff --git a/tests/testthat/_snaps/lazy.md b/tests/testthat/_snaps/lazy.md index 97df90723..292a4bc47 100644 --- a/tests/testthat/_snaps/lazy.md +++ b/tests/testthat/_snaps/lazy.md @@ -57,8 +57,7 @@ } }, "output_schema": null, - "projection": null, - "selection": null + "filter": null } }, "predicate": { @@ -78,7 +77,8 @@ }, "options": { "run_parallel": true, - "duplicate_check": true + "duplicate_check": true, + "should_broadcast": true } } } diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index b191f3aba..46bba75a1 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -969,7 +969,7 @@ test_that("pivot examples", { expect_identical( df$pivot( - values = "baz", index = "foo", columns = "bar", aggregate_function = "first" + values = "baz", index = "foo", on = "bar", aggregate_function = "first" )$to_list(), list(foo = c("one", "two"), A = c(1, 4), B = c(2, 5), C = c(3, 6)) ) @@ -985,7 +985,7 @@ test_that("pivot examples", { expect_equal( df$pivot( index = "col1", - columns = "col2", + on = "col2", values = "col3", aggregate_function = pl$element()$tanh()$mean() )$to_list(), @@ -1006,7 +1006,7 @@ test_that("pivot args works", { jaz = 6:1 ) expect_identical( - df$pivot("foo", "bar", "baz")$to_list(), + df$pivot("baz", index = "bar", values = "foo")$to_list(), list(bar = c("A", "B", "C"), `1.0` = c("one", NA, NA), `2.0` = c( NA, "one", NA @@ -1021,39 +1021,39 @@ test_that("pivot args works", { # aggr functions expect_identical( - df$pivot("cat", "ann", "bob", aggregate_function = "mean")$to_list(), + df$pivot("bob", index = "ann", values = "cat", aggregate_function = "mean")$to_list(), list(ann = c("one", "two"), A = c(2, 5), B = c(2, 5)) ) expect_identical( - df$pivot("cat", "ann", "bob", aggregate_function = pl$element()$mean())$to_list(), - df$pivot("cat", "ann", "bob", aggregate_function = "mean")$to_list() + df$pivot("bob", index = "ann", values = "cat", aggregate_function = pl$element()$mean())$to_list(), + df$pivot("bob", index = "ann", values = "cat", aggregate_function = "mean")$to_list() ) expect_grepl_error( - df$pivot("ann", "bob", "cat", aggregate_function = 42), - c("pivot", "param", "aggregate_function", 
"42") + df$pivot("cat", index = "bob", values = "ann", aggregate_function = 42), + "is neither a string, NULL or an Expr" ) expect_grepl_error( - df$pivot("ann", "bob", "cat", aggregate_function = "dummy"), - c("pivot", "dummy is not a method") + df$pivot("cat", index = "bob", values = "ann", aggregate_function = "dummy"), + "dummy is not a method" ) # maintain_order sort_columns expect_grepl_error( - df$pivot("ann", "bob", "cat", aggregate_function = "mean", maintain_order = 42), - c("pivot", "maintain_order", "bool") + df$pivot("cat", index = "bob", values = "ann", aggregate_function = "mean", maintain_order = 42), + "Expected a value of type \\[bool\\]" ) expect_grepl_error( - df$pivot("ann", "bob", "cat", aggregate_function = "mean", sort_columns = 42), - c("pivot", "sort_columns", "bool") + df$pivot("cat", index = "bob", values = "ann", aggregate_function = "mean", sort_columns = 42), + "Expected a value of type \\[bool\\]" ) # separator expect_named( - df$pivot(c("ann", "bob"), "ann", "cat", aggregate_function = "mean", separator = "."), + df$pivot("cat", index = "ann", values = c("ann", "bob"), aggregate_function = "mean", separator = "."), c( - "ann", "ann.cat.1.0", "ann.cat.2.0", "ann.cat.3.0", "ann.cat.4.0", - "ann.cat.5.0", "ann.cat.6.0", "bob.cat.1.0", "bob.cat.2.0", "bob.cat.3.0", - "bob.cat.4.0", "bob.cat.5.0", "bob.cat.6.0" + "ann", "ann.1.0", "ann.2.0", "ann.3.0", "ann.4.0", + "ann.5.0", "ann.6.0", "bob.1.0", "bob.2.0", "bob.3.0", + "bob.4.0", "bob.5.0", "bob.6.0" ) ) }) From a52f1ece85060f82e152365d42ee996356c1022f Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 16:39:56 +0200 Subject: [PATCH 22/47] fix handling of stats for parquet [skip ci] --- R/functions__eager.R | 4 ++-- src/rust/src/rdatatype.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/functions__eager.R b/R/functions__eager.R index 3f107f402..39f4c2a21 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -207,7 +207,7 @@ pl_date_range = function( ..., closed = "both") { interval = parse_as_polars_duration_string(interval) - date_range(start, end, interval, closed, time_unit, time_zone) |> + date_range(start, end, interval, closed) |> unwrap("in pl$date_range():") } @@ -251,7 +251,7 @@ pl_date_ranges = function( ..., closed = "both") { interval = parse_as_polars_duration_string(interval) - date_ranges(start, end, interval, closed, time_unit, time_zone) |> + date_ranges(start, end, interval, closed) |> unwrap("in pl$date_ranges():") } diff --git a/src/rust/src/rdatatype.rs b/src/rust/src/rdatatype.rs index 7b46dd80f..17249fcc4 100644 --- a/src/rust/src/rdatatype.rs +++ b/src/rust/src/rdatatype.rs @@ -730,8 +730,8 @@ pub fn robj_to_statistics_options(robj: Robj) -> RResult .map(|xi| (xi.0, xi.1.as_bool().unwrap())) .collect::>(); let mut out = SO::default(); - out.min_value = *hm.get(&"min_value").unwrap(); - out.max_value = *hm.get(&"max_value").unwrap(); + out.min_value = *hm.get(&"min").unwrap(); + out.max_value = *hm.get(&"max").unwrap(); out.distinct_count = *hm.get(&"distinct_count").unwrap(); out.null_count = *hm.get(&"null_count").unwrap(); Ok(out) From 381a756575cf40484ce9dbca150bae9e24df46db Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 17:13:56 +0200 Subject: [PATCH 23/47] bunch of fixes [skip ci] --- R/expr__expr.R | 80 ++++++++++++++++++++++++----- R/series__series.R | 20 +++++--- man/DataFrame_pivot.Rd | 22 ++++---- man/Expr_replace.Rd | 25 +++------ man/Expr_replace_strict.Rd | 66 ++++++++++++++++++++++++ 
man/IO_sink_parquet.Rd | 14 +++-- man/IO_write_parquet.Rd | 14 +++-- man/Series_value_counts.Rd | 11 +++- tests/testthat/test-dataframe.R | 38 +++++++------- tests/testthat/test-expr_datetime.R | 8 --- tests/testthat/test-expr_expr.R | 67 ++++++++++++++++++++---- tests/testthat/test-expr_string.R | 16 ++++-- tests/testthat/test-melt.R | 12 ++--- 13 files changed, 291 insertions(+), 102 deletions(-) create mode 100644 man/Expr_replace_strict.Rd diff --git a/R/expr__expr.R b/R/expr__expr.R index 984dead61..a413b6b51 100644 --- a/R/expr__expr.R +++ b/R/expr__expr.R @@ -3564,9 +3564,11 @@ Expr_rolling = function( unwrap("in $rolling():") } -#' Replace values by different values +#' Replace the given values by different values of the same data type. #' -#' This allows one to recode values in a column. +#' This allows one to recode values in a column, leaving all other values +#' unchanged. See [`$replace_strict()`][Expr_replace_strict] to give a default +#' value to all other values and to specify the output datatype. #' #' @param old Can be several things: #' * a vector indicating the values to recode; @@ -3576,11 +3578,6 @@ Expr_rolling = function( #' * an Expr #' @param new Either a vector of length 1, a vector of same length as `old` or #' an Expr. If missing, `old` must be a named list. -#' @param default The default replacement if the value is not in `old`. Can be -#' an Expr. If `NULL` (default), then the value doesn't change. -#' @param return_dtype The data type of the resulting expression. If set to -#' `NULL` (default), the data type is determined automatically based on the -#' other inputs. #' #' @return Expr #' @examples @@ -3593,33 +3590,88 @@ Expr_rolling = function( #' # "old" can be a named list where names are values to replace, and values are #' # the replacements #' mapping = list(`2` = 100, `3` = 200) -#' df$with_columns(replaced = pl$col("a")$replace(mapping, default = -1)) +#' df$with_columns(replaced = pl$col("a")$replace(mapping)) #' #' df = pl$DataFrame(a = c("x", "y", "z")) #' mapping = list(x = 1, y = 2, z = 3) #' df$with_columns(replaced = pl$col("a")$replace(mapping)) #' -#' # one can specify the data type to return instead of automatically inferring it -#' df$with_columns(replaced = pl$col("a")$replace(mapping, return_dtype = pl$Int8)) +#' # "old" and "new" can take Expr +#' df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) +#' df$with_columns( +#' replaced = pl$col("a")$replace( +#' old = pl$col("a")$max(), +#' new = pl$col("b")$sum() +#' ) +#' ) +Expr_replace = function(old, new) { + if (missing(new) && is.list(old)) { + new = unlist(old, use.names = FALSE) + old = names(old) + } + .pr$Expr$replace(self, old, new) |> + unwrap("in $replace():") +} + + +#' Replace all values by different values. +#' +#' This changes all the values in a column, either using a specific replacement +#' or a default one. See [`$replace()`][Expr_replace] to replace only a subset +#' of values. +#' +#' @inheritParams Expr_replace +#' @param default The default replacement if the value is not in `old`. Can be +#' an Expr. If `NULL` (default), then the value doesn't change. +#' @param return_dtype The data type of the resulting expression. If set to +#' `NULL` (default), the data type is determined automatically based on the +#' other inputs. 
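To contrast the two methods split out here: `$replace()` recodes only the matched values, while `$replace_strict()` rewrites every value, falling back to `default` for anything not listed in `old`. A minimal sketch on the same toy column as the examples; the expected outputs in the comments follow the documented semantics rather than an actual run:

df = pl$DataFrame(a = c(1, 2, 2, 3))
df$with_columns(replaced = pl$col("a")$replace(2, 100))                      # 1, 100, 100, 3
df$with_columns(replaced = pl$col("a")$replace_strict(2, 100, default = 0))  # 0, 100, 100, 0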
+#' +#' @return Expr +#' @examples +#' df = pl$DataFrame(a = c(1, 2, 2, 3)) +#' +#' # "old" and "new" can take vectors of length 1 or of same length +#' df$with_columns(replaced = pl$col("a")$replace_strict(2, 100, default = 1)) +#' df$with_columns( +#' replaced = pl$col("a")$replace_strict(c(2, 3), c(100, 200), default = 1) +#' ) +#' +#' # "old" can be a named list where names are values to replace, and values are +#' # the replacements +#' mapping = list(`2` = 100, `3` = 200) +#' df$with_columns(replaced = pl$col("a")$replace_strict(mapping, default = -1)) +#' +#' # one can specify the data type to return instead of automatically +#' # inferring it +#' df$with_columns( +#' replaced = pl$col("a")$replace_strict(mapping, default = 1, return_dtype = pl$Int32) +#' ) #' #' # "old", "new", and "default" can take Expr #' df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) #' df$with_columns( -#' replaced = pl$col("a")$replace( +#' replaced = pl$col("a")$replace_strict( #' old = pl$col("a")$max(), #' new = pl$col("b")$sum(), #' default = pl$col("b"), #' ) #' ) -Expr_replace = function(old, new, default = NULL, return_dtype = NULL) { +Expr_replace_strict = function(old, new, default = NULL, return_dtype = NULL) { if (missing(new) && is.list(old)) { new = unlist(old, use.names = FALSE) old = names(old) } - .pr$Expr$replace(self, old, new, default, return_dtype) |> - unwrap("in $replace():") + # return_dtype = pl$foo is silently passed otherwise + if (!missing(return_dtype) && !is_polars_dtype(return_dtype)) { + Err_plain("`return_dtype` must be a valid dtype.") |> + unwrap("in $replace_strict():") + } + .pr$Expr$replace_strict(self, old, new, default, return_dtype) |> + unwrap("in $replace_strict():") } + #' Get the lengths of runs of identical values #' #' @return Expr diff --git a/R/series__series.R b/R/series__series.R index 7723919a9..6e8607357 100644 --- a/R/series__series.R +++ b/R/series__series.R @@ -371,7 +371,8 @@ Series_print = function() { #' #' as_polars_series("a")$add("-z") Series_add = function(other) { - .pr$Series$add(self, as_polars_series(other)) + .pr$Series$add(self, as_polars_series(other)) |> + unwrap("in $add():") } @@ -390,7 +391,8 @@ Series_add = function(other) { #' 1L - as_polars_series(1:3) #' as_polars_series(1:3) - 1L Series_sub = function(other) { - .pr$Series$sub(self, as_polars_series(other)) + .pr$Series$sub(self, as_polars_series(other)) |> + unwrap("in $sub():") } @@ -405,7 +407,8 @@ Series_sub = function(other) { #' as_polars_series(1:3)$div(as_polars_series(11:13)) #' as_polars_series(1:3)$div(1L) Series_div = function(other) { - .pr$Series$div(self, as_polars_series(other)) + .pr$Series$div(self, as_polars_series(other)) |> + unwrap("in $div():") } @@ -435,7 +438,8 @@ Series_floor_div = function(other) { #' as_polars_series(1:3)$mul(as_polars_series(11:13)) #' as_polars_series(1:3)$mul(1L) Series_mul = function(other) { - .pr$Series$mul(self, as_polars_series(other)) + .pr$Series$mul(self, as_polars_series(other)) |> + unwrap("in $mul():") } @@ -450,7 +454,8 @@ Series_mul = function(other) { #' as_polars_series(1:3)$mod(as_polars_series(11:13)) #' as_polars_series(1:3)$mod(1L) Series_mod = function(other) { - .pr$Series$rem(self, as_polars_series(other)) + .pr$Series$rem(self, as_polars_series(other)) |> + unwrap("in $mod():") } @@ -578,8 +583,9 @@ Series_to_list = \(int64_conversion = polars_options()$int64_conversion) { #' @return DataFrame #' @examples #' as_polars_series(iris$Species, name = "flower species")$value_counts() -Series_value_counts 
= function(..., sort = TRUE, parallel = FALSE, name = "count") { - unwrap(.pr$Series$value_counts(self, sort, parallel, name), "in $value_counts():") +Series_value_counts = function(..., sort = TRUE, parallel = FALSE, name = "count", normalize = FALSE) { + .pr$Series$value_counts(self, sort, parallel, name, normalize) |> + unwrap("in $value_counts():") } #' Apply every value with an R fun diff --git a/man/DataFrame_pivot.Rd b/man/DataFrame_pivot.Rd index 2c9634b75..31143411c 100644 --- a/man/DataFrame_pivot.Rd +++ b/man/DataFrame_pivot.Rd @@ -5,10 +5,10 @@ \title{Pivot data from long to wide} \usage{ DataFrame_pivot( - values, - index, - columns, + on, ..., + index, + values, aggregate_function = NULL, maintain_order = TRUE, sort_columns = FALSE, @@ -16,16 +16,16 @@ DataFrame_pivot( ) } \arguments{ -\item{values}{Column values to aggregate. Can be multiple columns if the -\code{columns} arguments contains multiple columns as well.} - -\item{index}{One or multiple keys to group by.} - -\item{columns}{Name of the column(s) whose values will be used as the header +\item{on}{Name of the column(s) whose values will be used as the header of the output DataFrame.} \item{...}{Not used.} +\item{index}{One or multiple keys to group by.} + +\item{values}{Column values to aggregate. Can be multiple columns if the +\code{on} arguments contains multiple columns as well.} + \item{aggregate_function}{One of: \itemize{ \item string indicating the expressions to aggregate with, such as 'first', @@ -56,7 +56,7 @@ df = pl$DataFrame( df df$pivot( - values = "baz", index = "foo", columns = "bar" + values = "baz", index = "foo", on = "bar" ) # Run an expression as aggregation function @@ -69,7 +69,7 @@ df df$pivot( index = "col1", - columns = "col2", + on = "col2", values = "col3", aggregate_function = pl$element()$tanh()$mean() ) diff --git a/man/Expr_replace.Rd b/man/Expr_replace.Rd index c4931ecad..c50e7b348 100644 --- a/man/Expr_replace.Rd +++ b/man/Expr_replace.Rd @@ -2,9 +2,9 @@ % Please edit documentation in R/expr__expr.R \name{Expr_replace} \alias{Expr_replace} -\title{Replace values by different values} +\title{Replace the given values by different values of the same data type.} \usage{ -Expr_replace(old, new, default = NULL, return_dtype = NULL) +Expr_replace(old, new) } \arguments{ \item{old}{Can be several things: @@ -18,19 +18,14 @@ if old values are numeric, the names must be wrapped in backticks; \item{new}{Either a vector of length 1, a vector of same length as \code{old} or an Expr. If missing, \code{old} must be a named list.} - -\item{default}{The default replacement if the value is not in \code{old}. Can be -an Expr. If \code{NULL} (default), then the value doesn't change.} - -\item{return_dtype}{The data type of the resulting expression. If set to -\code{NULL} (default), the data type is determined automatically based on the -other inputs.} } \value{ Expr } \description{ -This allows one to recode values in a column. +This allows one to recode values in a column, leaving all other values +unchanged. See \code{\link[=Expr_replace_strict]{$replace_strict()}} to give a default +value to all other values and to specify the output datatype. 
} \examples{ df = pl$DataFrame(a = c(1, 2, 2, 3)) @@ -42,22 +37,18 @@ df$with_columns(replaced = pl$col("a")$replace(c(2, 3), c(100, 200))) # "old" can be a named list where names are values to replace, and values are # the replacements mapping = list(`2` = 100, `3` = 200) -df$with_columns(replaced = pl$col("a")$replace(mapping, default = -1)) +df$with_columns(replaced = pl$col("a")$replace(mapping)) df = pl$DataFrame(a = c("x", "y", "z")) mapping = list(x = 1, y = 2, z = 3) df$with_columns(replaced = pl$col("a")$replace(mapping)) -# one can specify the data type to return instead of automatically inferring it -df$with_columns(replaced = pl$col("a")$replace(mapping, return_dtype = pl$Int8)) - -# "old", "new", and "default" can take Expr +# "old" and "new" can take Expr df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) df$with_columns( replaced = pl$col("a")$replace( old = pl$col("a")$max(), - new = pl$col("b")$sum(), - default = pl$col("b"), + new = pl$col("b")$sum() ) ) } diff --git a/man/Expr_replace_strict.Rd b/man/Expr_replace_strict.Rd new file mode 100644 index 000000000..4a93659fd --- /dev/null +++ b/man/Expr_replace_strict.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/expr__expr.R +\name{Expr_replace_strict} +\alias{Expr_replace_strict} +\title{Replace all values by different values.} +\usage{ +Expr_replace_strict(old, new, default = NULL, return_dtype = NULL) +} +\arguments{ +\item{old}{Can be several things: +\itemize{ +\item a vector indicating the values to recode; +\item if \code{new} is missing, this can be a named list e.g \code{list(old = "new")} where +the names are the old values and the values are the replacements. Note that +if old values are numeric, the names must be wrapped in backticks; +\item an Expr +}} + +\item{new}{Either a vector of length 1, a vector of same length as \code{old} or +an Expr. If missing, \code{old} must be a named list.} + +\item{default}{The default replacement if the value is not in \code{old}. Can be +an Expr. If \code{NULL} (default), then the value doesn't change.} + +\item{return_dtype}{The data type of the resulting expression. If set to +\code{NULL} (default), the data type is determined automatically based on the +other inputs.} +} +\value{ +Expr +} +\description{ +This changes all the values in a column, either using a specific replacement +or a default one. See \code{\link[=Expr_replace]{$replace()}} to replace only a subset +of values. 
+} +\examples{ +df = pl$DataFrame(a = c(1, 2, 2, 3)) + +# "old" and "new" can take vectors of length 1 or of same length +df$with_columns(replaced = pl$col("a")$replace_strict(2, 100, default = 1)) +df$with_columns( + replaced = pl$col("a")$replace_strict(c(2, 3), c(100, 200), default = 1) +) + +# "old" can be a named list where names are values to replace, and values are +# the replacements +mapping = list(`2` = 100, `3` = 200) +df$with_columns(replaced = pl$col("a")$replace_strict(mapping, default = -1)) + +# one can specify the data type to return instead of automatically +# inferring it +df$with_columns( + replaced = pl$col("a")$replace_strict(mapping, default = 1, return_dtype = pl$Int32) +) + +# "old", "new", and "default" can take Expr +df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) +df$with_columns( + replaced = pl$col("a")$replace_strict( + old = pl$col("a")$max(), + new = pl$col("b")$sum(), + default = pl$col("b"), + ) +) +} diff --git a/man/IO_sink_parquet.Rd b/man/IO_sink_parquet.Rd index 7108c04a7..19e558164 100644 --- a/man/IO_sink_parquet.Rd +++ b/man/IO_sink_parquet.Rd @@ -9,7 +9,7 @@ LazyFrame_sink_parquet( ..., compression = "zstd", compression_level = 3, - statistics = FALSE, + statistics = TRUE, row_group_size = NULL, data_pagesize_limit = NULL, maintain_order = TRUE, @@ -48,8 +48,16 @@ means smaller files on disk: \item "zstd": min-level: 1, max-level: 22. }} -\item{statistics}{Logical. Whether compute and write column statistics. -This requires extra compute.} +\item{statistics}{Whether statistics should be written to the Parquet +headers. Possible values: +\itemize{ +\item \code{TRUE}: enable default set of statistics (default) +\item \code{FALSE}: disable all statistics +\item \code{"full"}: calculate and write all available statistics. +\item A named list where all values must be \code{TRUE} or \code{FALSE}, e.g. +\code{list(min = TRUE, max = FALSE)}. Statistics available are \code{"min"}, \code{"max"}, +\code{"distinct_count"}, \code{"null_count"}. +}} \item{row_group_size}{\code{NULL} or Integer. Size of the row groups in number of rows. If \code{NULL} (default), the chunks of the DataFrame are used. Writing in diff --git a/man/IO_write_parquet.Rd b/man/IO_write_parquet.Rd index d29039a03..ef01ef9cf 100644 --- a/man/IO_write_parquet.Rd +++ b/man/IO_write_parquet.Rd @@ -9,7 +9,7 @@ DataFrame_write_parquet( ..., compression = "zstd", compression_level = 3, - statistics = FALSE, + statistics = TRUE, row_group_size = NULL, data_pagesize_limit = NULL ) @@ -40,8 +40,16 @@ means smaller files on disk: \item "zstd": min-level: 1, max-level: 22. }} -\item{statistics}{Logical. Whether compute and write column statistics. -This requires extra compute.} +\item{statistics}{Whether statistics should be written to the Parquet +headers. Possible values: +\itemize{ +\item \code{TRUE}: enable default set of statistics (default) +\item \code{FALSE}: disable all statistics +\item \code{"full"}: calculate and write all available statistics. +\item A named list where all values must be \code{TRUE} or \code{FALSE}, e.g. +\code{list(min = TRUE, max = FALSE)}. Statistics available are \code{"min"}, \code{"max"}, +\code{"distinct_count"}, \code{"null_count"}. +}} \item{row_group_size}{\code{NULL} or Integer. Size of the row groups in number of rows. If \code{NULL} (default), the chunks of the DataFrame are used. 
Writing in diff --git a/man/Series_value_counts.Rd b/man/Series_value_counts.Rd index cd0423392..ba711ec3f 100644 --- a/man/Series_value_counts.Rd +++ b/man/Series_value_counts.Rd @@ -4,7 +4,13 @@ \alias{Series_value_counts} \title{Count the occurrences of unique values} \usage{ -Series_value_counts(..., sort = TRUE, parallel = FALSE, name = "count") +Series_value_counts( + ..., + sort = TRUE, + parallel = FALSE, + name = "count", + normalize = FALSE +) } \arguments{ \item{...}{Ignored.} @@ -16,6 +22,9 @@ lead to contention.} \item{name}{Give the resulting count column a specific name. The default is \code{"count"} if \code{normalize = FALSE} and \code{"proportion"} if \code{normalize = TRUE}.} + +\item{normalize}{If \code{TRUE}, it gives relative frequencies of the unique +values instead of their count.} } \value{ DataFrame diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index 46bba75a1..ef747bd17 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -331,25 +331,25 @@ test_that("map_batches unity", { }) -test_that("map_batches type", { - int_iris = iris - int_iris[] = lapply(iris, as.integer) - - # auto new type allowed if return is R vector - x = pl$DataFrame(iris)$ - select( - pl$col("Sepal.Length")$ - map_batches(\(s) { - as.integer(s$to_r()) # ok to return R vector also, will be - # converted back to series named "" - })$ - map_batches(\(s) s * 25L)$ - map_batches(\(s) s / 4) - )$ - to_data_frame()[, 1, drop = FALSE] - - expect_identical(x, int_iris[, 1, drop = FALSE] * 25L / 4L) -}) +# test_that("map_batches type", { +# int_iris = iris +# int_iris[] = lapply(iris, as.integer) +# +# # auto new type allowed if return is R vector +# x = pl$DataFrame(iris)$ +# select( +# pl$col("Sepal.Length")$ +# map_batches(\(s) { +# as.integer(s$to_r()) # ok to return R vector also, will be +# # converted back to series named "" +# })$ +# map_batches(\(s) s * 25L)$ +# map_batches(\(s) s / 4) +# )$ +# to_data_frame()[, 1, drop = FALSE] +# +# expect_identical(x, int_iris[, 1, drop = FALSE] * 25L / 4L) +# }) test_that("cloning", { pf = pl$DataFrame(iris) diff --git a/tests/testthat/test-expr_datetime.R b/tests/testthat/test-expr_datetime.R index 718875ec0..531066489 100644 --- a/tests/testthat/test-expr_datetime.R +++ b/tests/testthat/test-expr_datetime.R @@ -90,14 +90,6 @@ test_that("pl$date_range", { )$to_series()$to_vector(), seq(as.Date("2022-01-01"), as.Date("2022-03-01"), by = "1 month") ) - - # Deprecated usage - expect_identical( - suppressWarnings(pl$date_range( - as.POSIXct("2022-01-01 12:00", "UTC"), as.POSIXct("2022-01-03", "UTC"), "1d" - )$to_series()$to_vector()), - as.POSIXct(c("2022-01-01 12:00", "2022-01-02 12:00"), "UTC") - ) }) test_that("dt$truncate", { diff --git a/tests/testthat/test-expr_expr.R b/tests/testthat/test-expr_expr.R index 10f938f86..ac49e68ff 100644 --- a/tests/testthat/test-expr_expr.R +++ b/tests/testthat/test-expr_expr.R @@ -2826,8 +2826,8 @@ test_that("replace works", { # the replacements mapping = list(`2` = 100, `3` = 200) expect_equal( - df$select(replaced = pl$col("a")$replace(mapping, default = -1))$to_list(), - list(replaced = c(-1, 100, 100, 200)) + df$select(replaced = pl$col("a")$replace(mapping))$to_list(), + list(replaced = c(1, 100, 100, 200)) ) df = pl$DataFrame(a = c("x", "y", "z")) @@ -2837,10 +2837,59 @@ test_that("replace works", { list(replaced = c("1.0", "2.0", "3.0")) ) + # "old", "new", and "default" can take Expr + df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) + 
expect_equal( + df$select( + replaced = pl$col("a")$replace( + old = pl$col("a")$max(), + new = pl$col("b")$sum() + ) + )$to_list(), + list(replaced = c(1, 2, 2, 10)) + ) +}) + +test_that("replace_strict works", { + df = pl$DataFrame(a = c(1, 2, 2, 3)) + + # replace_strict requires a default value + expect_error( + df$select(replaced = pl$col("a")$replace_strict(2, 100, return_dtype = pl$Float32))$to_list(), + "incomplete mapping specified for `replace_strict`" + ) + expect_equal( + df$select(replaced = pl$col("a")$replace_strict(c(2, 3), 999, default = 1))$to_list(), + list(replaced = c(1, 999, 999, 999)) + ) + expect_equal( + df$select(replaced = pl$col("a")$replace_strict(c(2, 3), c(100, 200), default = 1))$to_list(), + list(replaced = c(1, 100, 100, 200)) + ) + + # "old" can be a named list where names are values to replace, and values are + # the replacements + mapping = list(`2` = 100, `3` = 200) + expect_equal( + df$select(replaced = pl$col("a")$replace_strict(mapping, default = -1))$to_list(), + list(replaced = c(-1, 100, 100, 200)) + ) + + df = pl$DataFrame(a = c("x", "y", "z")) + mapping = list(x = 1, y = 2, z = 3) + expect_equal( + df$select(replaced = pl$col("a")$replace_strict(mapping, return_dtype = pl$String))$to_list(), + list(replaced = c("1.0", "2.0", "3.0")) + ) + expect_error( + df$select(pl$col("a")$replace_strict(mapping, return_dtype = pl$foo)), + "must be a valid dtype" + ) + # one can specify the data type to return instead of automatically inferring it expect_equal( df$ - select(replaced = pl$col("a")$replace(mapping, return_dtype = pl$Int8))$ + select(replaced = pl$col("a")$replace_strict(mapping, return_dtype = pl$Int32))$ to_list(), list(replaced = 1:3) ) @@ -2849,7 +2898,7 @@ test_that("replace works", { df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) expect_equal( df$select( - replaced = pl$col("a")$replace( + replaced = pl$col("a")$replace_strict( old = pl$col("a")$max(), new = pl$col("b")$sum(), default = pl$col("b"), @@ -2903,8 +2952,8 @@ test_that("cut works", { cut = pl$col("foo")$cut(c(-1, 1), include_breaks = TRUE) )$unnest("cut")$to_list(), list( - brk = c(-1, -1, 1, 1, Inf), - foo_bin = factor(c("(-inf, -1]", "(-inf, -1]", "(-1, 1]", "(-1, 1]", "(1, inf]")) + breakpoint = c(-1, -1, 1, 1, Inf), + category = factor(c("(-inf, -1]", "(-inf, -1]", "(-1, 1]", "(-1, 1]", "(1, inf]")) ) ) @@ -2913,8 +2962,8 @@ test_that("cut works", { cut = pl$col("foo")$cut(c(-1, 1), include_breaks = TRUE, left_closed = TRUE) )$unnest("cut")$to_list(), list( - brk = c(-1, 1, 1, Inf, Inf), - foo_bin = factor(c("[-inf, -1)", "[-1, 1)", "[-1, 1)", "[1, inf)", "[1, inf)")) + breakpoint = c(-1, 1, 1, Inf, Inf), + category = factor(c("[-inf, -1)", "[-1, 1)", "[-1, 1)", "[1, inf)", "[1, inf)")) ) ) }) @@ -2933,7 +2982,7 @@ test_that("qcut works", { df$select( qcut = pl$col("foo")$qcut(c(0.25, 0.75), labels = c("a", "b", "c"), include_breaks = TRUE) )$unnest("qcut")$to_list(), - list(brk = c(-1, -1, 1, 1, Inf), foo_bin = factor(c("a", "a", "b", "b", "c"))) + list(breakpoint = c(-1, -1, 1, 1, Inf), category = factor(c("a", "a", "b", "b", "c"))) ) expect_equal( diff --git a/tests/testthat/test-expr_string.R b/tests/testthat/test-expr_string.R index c4dec7167..c8dee7327 100644 --- a/tests/testthat/test-expr_string.R +++ b/tests/testthat/test-expr_string.R @@ -156,22 +156,30 @@ test_that("str$concat", { # concatenate a Series of strings to a single string df = pl$DataFrame(foo = c("1", "a", NA)) expect_identical( - df$select(pl$col("foo")$str$concat())$to_list()[[1]], + 
df$select(pl$col("foo")$str$join())$to_list()[[1]], "1a" ) expect_identical( - df$select(pl$col("foo")$str$concat("-"))$to_list()[[1]], + df$select(pl$col("foo")$str$join("-"))$to_list()[[1]], "1-a" ) expect_identical( - df$select(pl$col("foo")$str$concat(ignore_nulls = FALSE))$to_list()[[1]], + df$select(pl$col("foo")$str$join(ignore_nulls = FALSE))$to_list()[[1]], NA_character_ ) + # deprecated + expect_warning( + expect_identical( + df$select(pl$col("foo")$str$concat("-"))$to_list()[[1]], + "1-a" + ), + "deprecated" + ) # Series list of strings to Series of concatenated strings df = pl$DataFrame(list(bar = list(c("a", "b", "c"), c("1", "2", "æ")))) expect_identical( - df$select(pl$col("bar")$list$eval(pl$element()$str$concat())$list$first())$to_list()$bar, + df$select(pl$col("bar")$list$eval(pl$element()$str$join())$list$first())$to_list()$bar, sapply(df$to_list()[[1]], paste, collapse = "") ) }) diff --git a/tests/testthat/test-melt.R b/tests/testthat/test-melt.R index 9449a01d3..cf112f374 100644 --- a/tests/testthat/test-melt.R +++ b/tests/testthat/test-melt.R @@ -1,4 +1,4 @@ -patrick::with_parameters_test_that("melt example", +patrick::with_parameters_test_that("unpivot example", { df_1 = pl[[create_func]]( a = c("x", "y", "z"), @@ -9,7 +9,7 @@ patrick::with_parameters_test_that("melt example", expect_true(is_func(df_1)) expect_identical( - df_1$melt(id_vars = "a", value_vars = c("b", "c")) |> as.data.frame(), + df_1$unpivot(index = "a", on = c("b", "c")) |> as.data.frame(), data.frame( a = c("x", "y", "z", "x", "y", "z"), variable = c("b", "b", "b", "c", "c", "c"), @@ -17,7 +17,7 @@ patrick::with_parameters_test_that("melt example", ) ) expect_identical( - df_1$melt(id_vars = c("c", "b"), value_vars = "a") |> as.data.frame(), + df_1$unpivot(index = c("c", "b"), value_vars = "a") |> as.data.frame(), data.frame( c = c(2, 4, 6), b = c(1, 3, 5), @@ -26,7 +26,7 @@ patrick::with_parameters_test_that("melt example", ) ) expect_identical( - df_1$melt(id_vars = c("a", "b"), value_vars = "c") |> as.data.frame(), + df_1$unpivot(index = c("a", "b"), value_vars = "c") |> as.data.frame(), data.frame( a = c("x", "y", "z"), b = c(1, 3, 5), @@ -36,8 +36,8 @@ patrick::with_parameters_test_that("melt example", ) expect_identical( - df_1$melt( - id_vars = c("a", "b"), + df_1$unpivot( + index = c("a", "b"), value_vars = c("c"), value_name = "alice", variable_name = "bob" From 7b174f5fb3e4e21903be970fc2b52bb0dea9ef3a Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 17:18:33 +0200 Subject: [PATCH 24/47] snapshot --- tests/testthat/_snaps/after-wrappers.md | 378 +++++------------------- 1 file changed, 80 insertions(+), 298 deletions(-) diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index be3569a32..6fda14e17 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -257,198 +257,25 @@ [121] "product" "qcut" "quantile" [124] "rank" "rechunk" "reinterpret" [127] "rep" "repeat_by" "replace" - [130] "reshape" "reverse" "rle" - [133] "rle_id" "rolling" "rolling_max" - [136] "rolling_max_by" "rolling_mean" "rolling_mean_by" - [139] "rolling_median" "rolling_median_by" "rolling_min" - [142] "rolling_min_by" "rolling_quantile" "rolling_quantile_by" - [145] "rolling_skew" "rolling_std" "rolling_std_by" - [148] "rolling_sum" "rolling_sum_by" "rolling_var" - [151] "rolling_var_by" "round" "sample" - [154] "search_sorted" "set_sorted" "shift" - [157] "shift_and_fill" "shrink_dtype" "shuffle" - [160] "sign" 
"sin" "sinh" - [163] "skew" "slice" "sort" - [166] "sort_by" "sqrt" "std" - [169] "str" "struct" "sub" - [172] "sum" "tail" "tan" - [175] "tanh" "to_physical" "to_r" - [178] "to_series" "top_k" "unique" - [181] "unique_counts" "upper_bound" "value_counts" - [184] "var" "xor" - ---- - - Code - ls(.pr[[private_key]]) - Output - [1] "abs" "add" - [3] "agg_groups" "alias" - [5] "all" "and" - [7] "any" "append" - [9] "approx_n_unique" "arccos" - [11] "arccosh" "arcsin" - [13] "arcsinh" "arctan" - [15] "arctanh" "arg_max" - [17] "arg_min" "arg_sort" - [19] "arg_unique" "arr_all" - [21] "arr_any" "arr_arg_max" - [23] "arr_arg_min" "arr_contains" - [25] "arr_count_matches" "arr_get" - [27] "arr_join" "arr_max" - [29] "arr_median" "arr_min" - [31] "arr_reverse" "arr_shift" - [33] "arr_sort" "arr_std" - [35] "arr_sum" "arr_to_list" - [37] "arr_to_struct" "arr_unique" - [39] "arr_var" "backward_fill" - [41] "bin_base64_decode" "bin_base64_encode" - [43] "bin_contains" "bin_ends_with" - [45] "bin_hex_decode" "bin_hex_encode" - [47] "bin_starts_with" "bottom_k" - [49] "cast" "cat_get_categories" - [51] "cat_set_ordering" "ceil" - [53] "clip" "clip_max" - [55] "clip_min" "col" - [57] "cols" "corr" - [59] "cos" "cosh" - [61] "count" "cov" - [63] "cum_count" "cum_max" - [65] "cum_min" "cum_prod" - [67] "cum_sum" "cumulative_eval" - [69] "cut" "diff" - [71] "div" "dot" - [73] "drop_nans" "drop_nulls" - [75] "dt_cast_time_unit" "dt_combine" - [77] "dt_convert_time_zone" "dt_day" - [79] "dt_epoch_seconds" "dt_hour" - [81] "dt_is_leap_year" "dt_iso_year" - [83] "dt_microsecond" "dt_millisecond" - [85] "dt_minute" "dt_month" - [87] "dt_nanosecond" "dt_offset_by" - [89] "dt_ordinal_day" "dt_quarter" - [91] "dt_replace_time_zone" "dt_round" - [93] "dt_second" "dt_strftime" - [95] "dt_time" "dt_timestamp" - [97] "dt_total_days" "dt_total_hours" - [99] "dt_total_microseconds" "dt_total_milliseconds" - [101] "dt_total_minutes" "dt_total_nanoseconds" - [103] "dt_total_seconds" "dt_truncate" - [105] "dt_week" "dt_weekday" - [107] "dt_with_time_unit" "dt_year" - [109] "dtype_cols" "entropy" - [111] "eq" "eq_missing" - [113] "ewm_mean" "ewm_std" - [115] "ewm_var" "exclude" - [117] "exclude_dtype" "exp" - [119] "explode" "extend_constant" - [121] "fill_nan" "fill_null" - [123] "fill_null_with_strategy" "filter" - [125] "first" "flatten" - [127] "floor" "floor_div" - [129] "forward_fill" "gather" - [131] "gather_every" "gt" - [133] "gt_eq" "hash" - [135] "head" "implode" - [137] "interpolate" "is_between" - [139] "is_duplicated" "is_finite" - [141] "is_first_distinct" "is_in" - [143] "is_infinite" "is_last_distinct" - [145] "is_nan" "is_not_nan" - [147] "is_not_null" "is_null" - [149] "is_unique" "kurtosis" - [151] "last" "len" - [153] "list_all" "list_any" - [155] "list_arg_max" "list_arg_min" - [157] "list_contains" "list_diff" - [159] "list_eval" "list_gather" - [161] "list_gather_every" "list_get" - [163] "list_join" "list_len" - [165] "list_max" "list_mean" - [167] "list_min" "list_n_unique" - [169] "list_reverse" "list_set_operation" - [171] "list_shift" "list_slice" - [173] "list_sort" "list_sum" - [175] "list_to_struct" "list_unique" - [177] "lit" "log" - [179] "log10" "lower_bound" - [181] "lt" "lt_eq" - [183] "map_batches" "map_batches_in_background" - [185] "map_elements_in_background" "max" - [187] "mean" "median" - [189] "meta_eq" "meta_has_multiple_outputs" - [191] "meta_is_regex_projection" "meta_output_name" - [193] "meta_pop" "meta_root_names" - [195] "meta_tree_format" "meta_undo_aliases" - [197] "min" 
"mode" - [199] "mul" "n_unique" - [201] "name_keep" "name_map" - [203] "name_prefix" "name_prefix_fields" - [205] "name_suffix" "name_suffix_fields" - [207] "name_to_lowercase" "name_to_uppercase" - [209] "nan_max" "nan_min" - [211] "neq" "neq_missing" - [213] "new_first" "new_last" - [215] "new_len" "not" - [217] "null_count" "or" - [219] "over" "pct_change" - [221] "peak_max" "peak_min" - [223] "pow" "print" - [225] "product" "qcut" - [227] "qcut_uniform" "quantile" - [229] "rank" "rechunk" - [231] "reinterpret" "rem" - [233] "rep" "repeat_by" - [235] "replace" "replace_strict" - [237] "reshape" "reverse" - [239] "rle" "rle_id" - [241] "rolling" "rolling_corr" - [243] "rolling_cov" "rolling_max" - [245] "rolling_max_by" "rolling_mean" - [247] "rolling_mean_by" "rolling_median" - [249] "rolling_median_by" "rolling_min" - [251] "rolling_min_by" "rolling_quantile" - [253] "rolling_quantile_by" "rolling_skew" - [255] "rolling_std" "rolling_std_by" - [257] "rolling_sum" "rolling_sum_by" - [259] "rolling_var" "rolling_var_by" - [261] "round" "sample_frac" - [263] "sample_n" "search_sorted" - [265] "shift" "shift_and_fill" - [267] "shrink_dtype" "shuffle" - [269] "sign" "sin" - [271] "sinh" "skew" - [273] "slice" "sort_by" - [275] "sort_with" "std" - [277] "str_base64_decode" "str_base64_encode" - [279] "str_contains" "str_contains_any" - [281] "str_count_matches" "str_ends_with" - [283] "str_extract" "str_extract_all" - [285] "str_extract_groups" "str_find" - [287] "str_head" "str_hex_decode" - [289] "str_hex_encode" "str_join" - [291] "str_json_decode" "str_json_path_match" - [293] "str_len_bytes" "str_len_chars" - [295] "str_pad_end" "str_pad_start" - [297] "str_replace" "str_replace_all" - [299] "str_replace_many" "str_reverse" - [301] "str_slice" "str_split" - [303] "str_split_exact" "str_splitn" - [305] "str_starts_with" "str_strip_chars" - [307] "str_strip_chars_end" "str_strip_chars_start" - [309] "str_tail" "str_to_date" - [311] "str_to_datetime" "str_to_integer" - [313] "str_to_lowercase" "str_to_time" - [315] "str_to_titlecase" "str_to_uppercase" - [317] "str_zfill" "struct_field_by_name" - [319] "struct_rename_fields" "struct_with_fields" - [321] "sub" "sum" - [323] "tail" "tan" - [325] "tanh" "to_physical" - [327] "top_k" "unique" - [329] "unique_counts" "unique_stable" - [331] "upper_bound" "value_counts" - [333] "var" "xor" + [130] "replace_strict" "reshape" "reverse" + [133] "rle" "rle_id" "rolling" + [136] "rolling_max" "rolling_max_by" "rolling_mean" + [139] "rolling_mean_by" "rolling_median" "rolling_median_by" + [142] "rolling_min" "rolling_min_by" "rolling_quantile" + [145] "rolling_quantile_by" "rolling_skew" "rolling_std" + [148] "rolling_std_by" "rolling_sum" "rolling_sum_by" + [151] "rolling_var" "rolling_var_by" "round" + [154] "sample" "search_sorted" "set_sorted" + [157] "shift" "shift_and_fill" "shrink_dtype" + [160] "shuffle" "sign" "sin" + [163] "sinh" "skew" "slice" + [166] "sort" "sort_by" "sqrt" + [169] "std" "str" "struct" + [172] "sub" "sum" "tail" + [175] "tan" "tanh" "to_physical" + [178] "to_r" "to_series" "top_k" + [181] "unique" "unique_counts" "upper_bound" + [184] "value_counts" "var" "xor" # public and private methods of each class When @@ -512,33 +339,26 @@ [121] "print" "product" "qcut" [124] "quantile" "rank" "rechunk" [127] "reinterpret" "rep" "repeat_by" - [130] "replace" "reshape" "reverse" - [133] "rle" "rle_id" "rolling" - [136] "rolling_max" "rolling_max_by" "rolling_mean" - [139] "rolling_mean_by" "rolling_median" "rolling_median_by" - [142] 
"rolling_min" "rolling_min_by" "rolling_quantile" - [145] "rolling_quantile_by" "rolling_skew" "rolling_std" - [148] "rolling_std_by" "rolling_sum" "rolling_sum_by" - [151] "rolling_var" "rolling_var_by" "round" - [154] "sample" "search_sorted" "set_sorted" - [157] "shift" "shift_and_fill" "shrink_dtype" - [160] "shuffle" "sign" "sin" - [163] "sinh" "skew" "slice" - [166] "sort" "sort_by" "sqrt" - [169] "std" "str" "struct" - [172] "sub" "sum" "tail" - [175] "tan" "tanh" "to_physical" - [178] "to_r" "to_series" "top_k" - [181] "unique" "unique_counts" "upper_bound" - [184] "value_counts" "var" "when" - [187] "xor" - ---- - - Code - ls(.pr[[private_key]]) - Output - [1] "otherwise" "when" + [130] "replace" "replace_strict" "reshape" + [133] "reverse" "rle" "rle_id" + [136] "rolling" "rolling_max" "rolling_max_by" + [139] "rolling_mean" "rolling_mean_by" "rolling_median" + [142] "rolling_median_by" "rolling_min" "rolling_min_by" + [145] "rolling_quantile" "rolling_quantile_by" "rolling_skew" + [148] "rolling_std" "rolling_std_by" "rolling_sum" + [151] "rolling_sum_by" "rolling_var" "rolling_var_by" + [154] "round" "sample" "search_sorted" + [157] "set_sorted" "shift" "shift_and_fill" + [160] "shrink_dtype" "shuffle" "sign" + [163] "sin" "sinh" "skew" + [166] "slice" "sort" "sort_by" + [169] "sqrt" "std" "str" + [172] "struct" "sub" "sum" + [175] "tail" "tan" "tanh" + [178] "to_physical" "to_r" "to_series" + [181] "top_k" "unique" "unique_counts" + [184] "upper_bound" "value_counts" "var" + [187] "when" "xor" # public and private methods of each class ChainedWhen @@ -602,33 +422,26 @@ [121] "print" "product" "qcut" [124] "quantile" "rank" "rechunk" [127] "reinterpret" "rep" "repeat_by" - [130] "replace" "reshape" "reverse" - [133] "rle" "rle_id" "rolling" - [136] "rolling_max" "rolling_max_by" "rolling_mean" - [139] "rolling_mean_by" "rolling_median" "rolling_median_by" - [142] "rolling_min" "rolling_min_by" "rolling_quantile" - [145] "rolling_quantile_by" "rolling_skew" "rolling_std" - [148] "rolling_std_by" "rolling_sum" "rolling_sum_by" - [151] "rolling_var" "rolling_var_by" "round" - [154] "sample" "search_sorted" "set_sorted" - [157] "shift" "shift_and_fill" "shrink_dtype" - [160] "shuffle" "sign" "sin" - [163] "sinh" "skew" "slice" - [166] "sort" "sort_by" "sqrt" - [169] "std" "str" "struct" - [172] "sub" "sum" "tail" - [175] "tan" "tanh" "to_physical" - [178] "to_r" "to_series" "top_k" - [181] "unique" "unique_counts" "upper_bound" - [184] "value_counts" "var" "when" - [187] "xor" - ---- - - Code - ls(.pr[[private_key]]) - Output - [1] "otherwise" "when" + [130] "replace" "replace_strict" "reshape" + [133] "reverse" "rle" "rle_id" + [136] "rolling" "rolling_max" "rolling_max_by" + [139] "rolling_mean" "rolling_mean_by" "rolling_median" + [142] "rolling_median_by" "rolling_min" "rolling_min_by" + [145] "rolling_quantile" "rolling_quantile_by" "rolling_skew" + [148] "rolling_std" "rolling_std_by" "rolling_sum" + [151] "rolling_sum_by" "rolling_var" "rolling_var_by" + [154] "round" "sample" "search_sorted" + [157] "set_sorted" "shift" "shift_and_fill" + [160] "shrink_dtype" "shuffle" "sign" + [163] "sin" "sinh" "skew" + [166] "slice" "sort" "sort_by" + [169] "sqrt" "std" "str" + [172] "struct" "sub" "sum" + [175] "tail" "tan" "tanh" + [178] "to_physical" "to_r" "to_series" + [181] "top_k" "unique" "unique_counts" + [184] "upper_bound" "value_counts" "var" + [187] "when" "xor" # public and private methods of each class RField @@ -695,58 +508,27 @@ [127] "product" "qcut" "quantile" [130] 
"rank" "rechunk" "reinterpret" [133] "rename" "rep" "repeat_by" - [136] "replace" "reshape" "reverse" - [139] "rle" "rle_id" "rolling_max" - [142] "rolling_max_by" "rolling_mean" "rolling_mean_by" - [145] "rolling_median" "rolling_median_by" "rolling_min" - [148] "rolling_min_by" "rolling_quantile" "rolling_quantile_by" - [151] "rolling_skew" "rolling_std" "rolling_std_by" - [154] "rolling_sum" "rolling_sum_by" "rolling_var" - [157] "rolling_var_by" "round" "sample" - [160] "search_sorted" "set_sorted" "shape" - [163] "shift" "shift_and_fill" "shrink_dtype" - [166] "shuffle" "sign" "sin" - [169] "sinh" "skew" "slice" - [172] "sort" "sort_by" "sqrt" - [175] "std" "str" "struct" - [178] "sub" "sum" "tail" - [181] "tan" "tanh" "to_frame" - [184] "to_list" "to_lit" "to_physical" - [187] "to_r" "to_vector" "top_k" - [190] "unique" "unique_counts" "upper_bound" - [193] "value_counts" "var" "xor" - ---- - - Code - ls(.pr[[private_key]]) - Output - [1] "add" "alias" - [3] "all" "any" - [5] "append_mut" "arg_max" - [7] "arg_min" "can_fast_explode_flag" - [9] "chunk_lengths" "clear" - [11] "clone" "compare" - [13] "div" "dtype" - [15] "equals" "export_stream" - [17] "from_arrow_array_robj" "get_fmt" - [19] "import_stream" "is_sorted" - [21] "is_sorted_ascending_flag" "is_sorted_descending_flag" - [23] "len" "map_elements" - [25] "max" "mean" - [27] "median" "min" - [29] "mul" "n_chunks" - [31] "n_unique" "name" - [33] "new" "panic" - [35] "print" "rem" - [37] "rename_mut" "rep" - [39] "set_sorted_mut" "shape" - [41] "sleep" "sort" - [43] "std" "struct_fields" - [45] "sub" "sum" - [47] "to_fmt_char" "to_frame" - [49] "to_r" "value_counts" - [51] "var" + [136] "replace" "replace_strict" "reshape" + [139] "reverse" "rle" "rle_id" + [142] "rolling_max" "rolling_max_by" "rolling_mean" + [145] "rolling_mean_by" "rolling_median" "rolling_median_by" + [148] "rolling_min" "rolling_min_by" "rolling_quantile" + [151] "rolling_quantile_by" "rolling_skew" "rolling_std" + [154] "rolling_std_by" "rolling_sum" "rolling_sum_by" + [157] "rolling_var" "rolling_var_by" "round" + [160] "sample" "search_sorted" "set_sorted" + [163] "shape" "shift" "shift_and_fill" + [166] "shrink_dtype" "shuffle" "sign" + [169] "sin" "sinh" "skew" + [172] "slice" "sort" "sort_by" + [175] "sqrt" "std" "str" + [178] "struct" "sub" "sum" + [181] "tail" "tan" "tanh" + [184] "to_frame" "to_list" "to_lit" + [187] "to_physical" "to_r" "to_vector" + [190] "top_k" "unique" "unique_counts" + [193] "upper_bound" "value_counts" "var" + [196] "xor" # public and private methods of each class RThreadHandle From bfb59c71f14c7246e79ab859dda5fde40c0a3f74 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 17:41:35 +0200 Subject: [PATCH 25/47] more fixes [skip ci] --- R/extendr-wrappers.R | 2 +- src/rust/src/rdataframe/mod.rs | 3 +- tests/testthat/_snaps/after-wrappers.md | 219 ++++++++++++++++++++++++ tests/testthat/test-expr_datetime.R | 15 +- tests/testthat/test-expr_expr.R | 8 +- 5 files changed, 228 insertions(+), 19 deletions(-) diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 98c5e099e..dda668b79 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -208,7 +208,7 @@ RPolarsDataFrame$estimated_size <- function() .Call(wrap__RPolarsDataFrame__esti RPolarsDataFrame$null_count <- function() .Call(wrap__RPolarsDataFrame__null_count, self) -RPolarsDataFrame$unpivot <- function(on, index, value_name, variable_name, streamable) .Call(wrap__RPolarsDataFrame__unpivot, self, on, index, value_name, variable_name, 
streamable) +RPolarsDataFrame$unpivot <- function(on, index, value_name, variable_name) .Call(wrap__RPolarsDataFrame__unpivot, self, on, index, value_name, variable_name) RPolarsDataFrame$pivot_expr <- function(on, index, values, maintain_order, sort_columns, aggregate_expr, separator) .Call(wrap__RPolarsDataFrame__pivot_expr, self, on, index, values, maintain_order, sort_columns, aggregate_expr, separator) diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index bc2302c3c..925ad4428 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -388,14 +388,13 @@ impl RPolarsDataFrame { index: Robj, value_name: Robj, variable_name: Robj, - streamable: Robj, ) -> RResult { let args = UnpivotArgs { on: strings_to_smartstrings(robj_to!(Vec, String, on)?), index: strings_to_smartstrings(robj_to!(Vec, String, index)?), value_name: robj_to!(Option, String, value_name)?.map(|s| s.into()), variable_name: robj_to!(Option, String, variable_name)?.map(|s| s.into()), - streamable: robj_to!(bool, streamable)?, + streamable: false, }; self.0 diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index 6fda14e17..29f35f8c3 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -277,6 +277,179 @@ [181] "unique" "unique_counts" "upper_bound" [184] "value_counts" "var" "xor" +--- + + Code + ls(.pr[[private_key]]) + Output + [1] "abs" "add" + [3] "agg_groups" "alias" + [5] "all" "and" + [7] "any" "append" + [9] "approx_n_unique" "arccos" + [11] "arccosh" "arcsin" + [13] "arcsinh" "arctan" + [15] "arctanh" "arg_max" + [17] "arg_min" "arg_sort" + [19] "arg_unique" "arr_all" + [21] "arr_any" "arr_arg_max" + [23] "arr_arg_min" "arr_contains" + [25] "arr_count_matches" "arr_get" + [27] "arr_join" "arr_max" + [29] "arr_median" "arr_min" + [31] "arr_reverse" "arr_shift" + [33] "arr_sort" "arr_std" + [35] "arr_sum" "arr_to_list" + [37] "arr_to_struct" "arr_unique" + [39] "arr_var" "backward_fill" + [41] "bin_base64_decode" "bin_base64_encode" + [43] "bin_contains" "bin_ends_with" + [45] "bin_hex_decode" "bin_hex_encode" + [47] "bin_starts_with" "bottom_k" + [49] "cast" "cat_get_categories" + [51] "cat_set_ordering" "ceil" + [53] "clip" "clip_max" + [55] "clip_min" "col" + [57] "cols" "corr" + [59] "cos" "cosh" + [61] "count" "cov" + [63] "cum_count" "cum_max" + [65] "cum_min" "cum_prod" + [67] "cum_sum" "cumulative_eval" + [69] "cut" "diff" + [71] "div" "dot" + [73] "drop_nans" "drop_nulls" + [75] "dt_cast_time_unit" "dt_combine" + [77] "dt_convert_time_zone" "dt_day" + [79] "dt_epoch_seconds" "dt_hour" + [81] "dt_is_leap_year" "dt_iso_year" + [83] "dt_microsecond" "dt_millisecond" + [85] "dt_minute" "dt_month" + [87] "dt_nanosecond" "dt_offset_by" + [89] "dt_ordinal_day" "dt_quarter" + [91] "dt_replace_time_zone" "dt_round" + [93] "dt_second" "dt_strftime" + [95] "dt_time" "dt_timestamp" + [97] "dt_total_days" "dt_total_hours" + [99] "dt_total_microseconds" "dt_total_milliseconds" + [101] "dt_total_minutes" "dt_total_nanoseconds" + [103] "dt_total_seconds" "dt_truncate" + [105] "dt_week" "dt_weekday" + [107] "dt_with_time_unit" "dt_year" + [109] "dtype_cols" "entropy" + [111] "eq" "eq_missing" + [113] "ewm_mean" "ewm_std" + [115] "ewm_var" "exclude" + [117] "exclude_dtype" "exp" + [119] "explode" "extend_constant" + [121] "fill_nan" "fill_null" + [123] "fill_null_with_strategy" "filter" + [125] "first" "flatten" + [127] "floor" "floor_div" + [129] "forward_fill" "gather" + [131] 
"gather_every" "gt" + [133] "gt_eq" "hash" + [135] "head" "implode" + [137] "interpolate" "is_between" + [139] "is_duplicated" "is_finite" + [141] "is_first_distinct" "is_in" + [143] "is_infinite" "is_last_distinct" + [145] "is_nan" "is_not_nan" + [147] "is_not_null" "is_null" + [149] "is_unique" "kurtosis" + [151] "last" "len" + [153] "list_all" "list_any" + [155] "list_arg_max" "list_arg_min" + [157] "list_contains" "list_diff" + [159] "list_eval" "list_gather" + [161] "list_gather_every" "list_get" + [163] "list_join" "list_len" + [165] "list_max" "list_mean" + [167] "list_min" "list_n_unique" + [169] "list_reverse" "list_set_operation" + [171] "list_shift" "list_slice" + [173] "list_sort" "list_sum" + [175] "list_to_struct" "list_unique" + [177] "lit" "log" + [179] "log10" "lower_bound" + [181] "lt" "lt_eq" + [183] "map_batches" "map_batches_in_background" + [185] "map_elements_in_background" "max" + [187] "mean" "median" + [189] "meta_eq" "meta_has_multiple_outputs" + [191] "meta_is_regex_projection" "meta_output_name" + [193] "meta_pop" "meta_root_names" + [195] "meta_tree_format" "meta_undo_aliases" + [197] "min" "mode" + [199] "mul" "n_unique" + [201] "name_keep" "name_map" + [203] "name_prefix" "name_prefix_fields" + [205] "name_suffix" "name_suffix_fields" + [207] "name_to_lowercase" "name_to_uppercase" + [209] "nan_max" "nan_min" + [211] "neq" "neq_missing" + [213] "new_first" "new_last" + [215] "new_len" "not" + [217] "null_count" "or" + [219] "over" "pct_change" + [221] "peak_max" "peak_min" + [223] "pow" "print" + [225] "product" "qcut" + [227] "qcut_uniform" "quantile" + [229] "rank" "rechunk" + [231] "reinterpret" "rem" + [233] "rep" "repeat_by" + [235] "replace" "replace_strict" + [237] "reshape" "reverse" + [239] "rle" "rle_id" + [241] "rolling" "rolling_corr" + [243] "rolling_cov" "rolling_max" + [245] "rolling_max_by" "rolling_mean" + [247] "rolling_mean_by" "rolling_median" + [249] "rolling_median_by" "rolling_min" + [251] "rolling_min_by" "rolling_quantile" + [253] "rolling_quantile_by" "rolling_skew" + [255] "rolling_std" "rolling_std_by" + [257] "rolling_sum" "rolling_sum_by" + [259] "rolling_var" "rolling_var_by" + [261] "round" "sample_frac" + [263] "sample_n" "search_sorted" + [265] "shift" "shift_and_fill" + [267] "shrink_dtype" "shuffle" + [269] "sign" "sin" + [271] "sinh" "skew" + [273] "slice" "sort_by" + [275] "sort_with" "std" + [277] "str_base64_decode" "str_base64_encode" + [279] "str_contains" "str_contains_any" + [281] "str_count_matches" "str_ends_with" + [283] "str_extract" "str_extract_all" + [285] "str_extract_groups" "str_find" + [287] "str_head" "str_hex_decode" + [289] "str_hex_encode" "str_join" + [291] "str_json_decode" "str_json_path_match" + [293] "str_len_bytes" "str_len_chars" + [295] "str_pad_end" "str_pad_start" + [297] "str_replace" "str_replace_all" + [299] "str_replace_many" "str_reverse" + [301] "str_slice" "str_split" + [303] "str_split_exact" "str_splitn" + [305] "str_starts_with" "str_strip_chars" + [307] "str_strip_chars_end" "str_strip_chars_start" + [309] "str_tail" "str_to_date" + [311] "str_to_datetime" "str_to_integer" + [313] "str_to_lowercase" "str_to_time" + [315] "str_to_titlecase" "str_to_uppercase" + [317] "str_zfill" "struct_field_by_name" + [319] "struct_rename_fields" "struct_with_fields" + [321] "sub" "sum" + [323] "tail" "tan" + [325] "tanh" "to_physical" + [327] "top_k" "unique" + [329] "unique_counts" "unique_stable" + [331] "upper_bound" "value_counts" + [333] "var" "xor" + # public and private methods of each 
class When Code @@ -360,6 +533,13 @@ [184] "upper_bound" "value_counts" "var" [187] "when" "xor" +--- + + Code + ls(.pr[[private_key]]) + Output + [1] "otherwise" "when" + # public and private methods of each class ChainedWhen Code @@ -443,6 +623,13 @@ [184] "upper_bound" "value_counts" "var" [187] "when" "xor" +--- + + Code + ls(.pr[[private_key]]) + Output + [1] "otherwise" "when" + # public and private methods of each class RField Code @@ -530,6 +717,38 @@ [193] "upper_bound" "value_counts" "var" [196] "xor" +--- + + Code + ls(.pr[[private_key]]) + Output + [1] "add" "alias" + [3] "all" "any" + [5] "append_mut" "arg_max" + [7] "arg_min" "can_fast_explode_flag" + [9] "chunk_lengths" "clear" + [11] "clone" "compare" + [13] "div" "dtype" + [15] "equals" "export_stream" + [17] "from_arrow_array_robj" "get_fmt" + [19] "import_stream" "is_sorted" + [21] "is_sorted_ascending_flag" "is_sorted_descending_flag" + [23] "len" "map_elements" + [25] "max" "mean" + [27] "median" "min" + [29] "mul" "n_chunks" + [31] "n_unique" "name" + [33] "new" "panic" + [35] "print" "rem" + [37] "rename_mut" "rep" + [39] "set_sorted_mut" "shape" + [41] "sleep" "sort" + [43] "std" "struct_fields" + [45] "sub" "sum" + [47] "to_fmt_char" "to_frame" + [49] "to_r" "value_counts" + [51] "var" + # public and private methods of each class RThreadHandle Code diff --git a/tests/testthat/test-expr_datetime.R b/tests/testthat/test-expr_datetime.R index 531066489..75627cec8 100644 --- a/tests/testthat/test-expr_datetime.R +++ b/tests/testthat/test-expr_datetime.R @@ -100,8 +100,7 @@ test_that("dt$truncate", { # use a dt namespace function df = pl$DataFrame(datetime = s)$with_columns( - pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s"), - pl$col("datetime")$dt$truncate("4s", offset("3s"))$alias("truncated_4s_offset_2s") + pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s") ) l_actual = df$to_list() @@ -128,11 +127,8 @@ test_that("dt$round", { s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") # use a dt namespace function - ## TODO contribute POLARS, offset makes little sense, it should be implemented - ## before round not after. 
df = pl$DataFrame(datetime = s)$with_columns( - pl$col("datetime")$dt$round("8s")$alias("truncated_4s"), - pl$col("datetime")$dt$round("8s", offset("4s1ms"))$alias("truncated_4s_offset_2s") + pl$col("datetime")$dt$round("8s")$alias("truncated_4s") ) l_actual = df$to_list() @@ -140,8 +136,7 @@ test_that("dt$round", { lapply(l_actual, \(x) diff(x) |> as.numeric()), list( datetime = rep(2, 12), - truncated_4s = rep(c(0, 8, 0, 0), 3), - truncated_4s_offset_2s = rep(c(0, 8, 0, 0), 3) + truncated_4s = rep(c(0, 8, 0, 0), 3) ) ) @@ -153,10 +148,6 @@ test_that("dt$round", { pl$col("datetime")$dt$round(c("2s", "1h")), "`every` must be a single non-NA character or difftime" ) - expect_grepl_error( - pl$col("datetime")$dt$round("1s", 42), - "`offset` must be a single non-NA character or difftime" - ) }) test_that("dt$combine", { diff --git a/tests/testthat/test-expr_expr.R b/tests/testthat/test-expr_expr.R index ac49e68ff..e45629ba1 100644 --- a/tests/testthat/test-expr_expr.R +++ b/tests/testthat/test-expr_expr.R @@ -896,14 +896,14 @@ test_that("Expr_sort", { }) -test_that("Expr_k_top", { +test_that("$top_k() works", { l = list(a = c(6, 1, 0, NA, Inf, -Inf, NaN)) l_actual = pl$DataFrame(l)$select( pl$col("a")$top_k(3)$alias("k_top"), pl$col("a")$bottom_k(3)$alias("k_bot") ) - known = structure(list(k_top = c(NaN, Inf, 6), k_bot = c(NA, -Inf, 0)), + known = structure(list(k_top = c(NaN, Inf, 6), k_bot = c(-Inf, 0, 1)), row.names = c(NA, -3L), class = "data.frame" ) expect_equal(l_actual$to_data_frame(), known) @@ -2913,8 +2913,8 @@ test_that("rle works", { expect_equal( df$select(pl$col("s")$rle())$unnest("s")$to_data_frame(), data.frame( - lengths = c(2, 1, 1, 1, 1, 2), - values = c(1, 2, 1, NA, 1, 3) + len = c(2, 1, 1, 1, 1, 2), + value = c(1, 2, 1, NA, 1, 3) ) ) }) From 6f619a7baf62043e8326c0e87a966b8d5619bc69 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 21:33:24 +0200 Subject: [PATCH 26/47] fix slice, test arg "order_by" in $over() --- R/expr__expr.R | 24 ++++++++++++++++++------ man/Expr_over.Rd | 17 ++++++++++++++++- man/Expr_slice.Rd | 2 -- src/rust/src/lazy/dsl.rs | 13 +++++++++---- tests/testthat/test-expr_datetime.R | 8 +------- tests/testthat/test-expr_expr.R | 17 ++++++++++++++++- 6 files changed, 60 insertions(+), 21 deletions(-) diff --git a/R/expr__expr.R b/R/expr__expr.R index a413b6b51..c0d4a44d1 100644 --- a/R/expr__expr.R +++ b/R/expr__expr.R @@ -1163,11 +1163,7 @@ Expr_is_not_nan = use_extendr_wrapper #' full data. #' #' @return Expr -#' @aliases slice -#' @name Expr_slice -#' @format NULL #' @examples -#' #' # as head #' pl$DataFrame(list(a = 0:100))$select( #' pl$all()$slice(0, 6) @@ -1185,7 +1181,8 @@ Expr_is_not_nan = use_extendr_wrapper #' # recycling #' pl$DataFrame(mtcars)$with_columns(pl$col("mpg")$slice(0, 1)) Expr_slice = function(offset, length = NULL) { - .pr$Expr$slice(self, wrap_e(offset), wrap_e(length)) + .pr$Expr$slice(self, offset, wrap_e(length)) |> + unwrap("in $slice():") } @@ -1840,7 +1837,8 @@ Expr_last = use_extendr_wrapper #' @param ... Column(s) to group by. Accepts expression input. #' Characters are parsed as column names. #' @param order_by Order the window functions/aggregations with the partitioned -#' groups by the result of the expression passed to `order_by`. +#' groups by the result of the expression passed to `order_by`. Can be an Expr. +#' Strings are parsed as column names. 
#' @param mapping_strategy One of the following: #' * `"group_to_rows"` (default): if the aggregation results in multiple values, #' assign them back to their position in the DataFrame. This can only be done @@ -1886,6 +1884,20 @@ Expr_last = use_extendr_wrapper #' df$with_columns( #' top_2 = pl$col("c")$top_k(2)$over("a", mapping_strategy = "join") #' ) +#' +#' # order_by specifies how values are sorted within a group, which is +#' # essential when the operation depends on the order of values +#' df = pl$DataFrame( +#' g = c(1, 1, 1, 1, 2, 2, 2, 2), +#' t = c(1, 2, 3, 4, 4, 1, 2, 3), +#' x = c(10, 20, 30, 40, 10, 20, 30, 40) +#' ) +#' +#' # without order_by, the first and second values in the second group would +#' # be inverted, which would be wrong +#' df$with_columns( +#' x_lag = pl$col("x")$shift(1)$over("g", order_by = "t") +#' ) Expr_over = function(..., order_by = NULL, mapping_strategy = "group_to_rows") { list_of_exprs = list2(...) |> lapply(\(x) { diff --git a/man/Expr_over.Rd b/man/Expr_over.Rd index 4559f34e7..bfa3ba9b2 100644 --- a/man/Expr_over.Rd +++ b/man/Expr_over.Rd @@ -11,7 +11,8 @@ Expr_over(..., order_by = NULL, mapping_strategy = "group_to_rows") Characters are parsed as column names.} \item{order_by}{Order the window functions/aggregations with the partitioned -groups by the result of the expression passed to \code{order_by}.} +groups by the result of the expression passed to \code{order_by}. Can be an Expr. +Strings are parsed as column names.} \item{mapping_strategy}{One of the following: \itemize{ @@ -69,4 +70,18 @@ df$with_columns( df$with_columns( top_2 = pl$col("c")$top_k(2)$over("a", mapping_strategy = "join") ) + +# order_by specifies how values are sorted within a group, which is +# essential when the operation depends on the order of values +df = pl$DataFrame( + g = c(1, 1, 1, 1, 2, 2, 2, 2), + t = c(1, 2, 3, 4, 4, 1, 2, 3), + x = c(10, 20, 30, 40, 10, 20, 30, 40) +) + +# without order_by, the first and second values in the second group would +# be inverted, which would be wrong +df$with_columns( + x_lag = pl$col("x")$shift(1)$over("g", order_by = "t") +) } diff --git a/man/Expr_slice.Rd b/man/Expr_slice.Rd index 96420196b..28fa0a8a1 100644 --- a/man/Expr_slice.Rd +++ b/man/Expr_slice.Rd @@ -2,7 +2,6 @@ % Please edit documentation in R/expr__expr.R \name{Expr_slice} \alias{Expr_slice} -\alias{slice} \title{Get a slice of an Expr} \usage{ Expr_slice(offset, length = NULL) @@ -23,7 +22,6 @@ in those columns but will not change the number of rows in the data. See examples. 
} \examples{ - # as head pl$DataFrame(list(a = 0:100))$select( pl$all()$slice(0, 6) diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index d39975449..88d203d2e 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -1790,12 +1790,17 @@ impl RPolarsExpr { self.0.clone().len().into() } - pub fn slice(&self, offset: &RPolarsExpr, length: Nullable<&RPolarsExpr>) -> Self { + pub fn slice(&self, offset: Robj, length: Nullable<&RPolarsExpr>) -> RResult { + let offset = robj_to!(PLExpr, offset)?; let length = match null_to_opt(length) { - Some(i) => i.0.clone(), + Some(i) => dsl::cast(i.0.clone(), pl::DataType::Int64), None => dsl::lit(i64::MAX), }; - self.0.clone().slice(offset.0.clone(), length).into() + Ok(self + .0 + .clone() + .slice(dsl::cast(offset, pl::DataType::Int64), length) + .into()) } pub fn append(&self, other: &RPolarsExpr, upcast: bool) -> Self { @@ -1913,7 +1918,7 @@ impl RPolarsExpr { ) -> RResult { let partition_by = robj_to!(Vec, PLExpr, partition_by)?; - let order_by = robj_to!(Option, Vec, PLExpr, order_by)?.map(|order_by| { + let order_by = robj_to!(Option, Vec, PLExprCol, order_by)?.map(|order_by| { ( order_by, SortOptions { diff --git a/tests/testthat/test-expr_datetime.R b/tests/testthat/test-expr_datetime.R index 75627cec8..f79c7f37c 100644 --- a/tests/testthat/test-expr_datetime.R +++ b/tests/testthat/test-expr_datetime.R @@ -108,15 +108,9 @@ test_that("dt$truncate", { lapply(l_actual, \(x) diff(x) |> as.numeric()), list( datetime = rep(2, 12), - truncated_4s = rep(c(0, 4), 6), - truncated_4s_offset_2s = rep(c(0, 4), 6) + truncated_4s = rep(c(0, 4), 6) ) ) - - expect_identical( - as.numeric(l_actual$truncated_4s_offset_2s - l_actual$truncated_4s), - rep(3, 13) - ) }) diff --git a/tests/testthat/test-expr_expr.R b/tests/testthat/test-expr_expr.R index e45629ba1..4b46b81f2 100644 --- a/tests/testthat/test-expr_expr.R +++ b/tests/testthat/test-expr_expr.R @@ -324,11 +324,26 @@ test_that("$over() with mapping_strategy", { expect_identical( df$select(pl$col("val")$top_k(2)$over("a", mapping_strategy = "join"))$to_list(), list( - val = list(c(5L, 2L), c(5L, 2L), c(4L, 3L), c(4L, 3L), c(5L, 2L)) + val = list(c(5L, 2L), c(5L, 2L), c(3L, 4L), c(3L, 4L), c(5L, 2L)) ) ) }) +test_that("arg 'order_by' in $over() works", { + df = pl$DataFrame( + g = c(1, 1, 1, 1, 2, 2, 2, 2), + t = c(1, 2, 3, 4, 4, 1, 2, 3), + x = c(10, 20, 30, 40, 10, 20, 30, 40) + ) + + expect_equal( + df$select( + x_lag = pl$col("x")$shift(1)$over("g", order_by = "t") + )$to_list(), + list(x_lag = c(NA, 10, 20, 30, 40, NA, 20, 30)) + ) +}) + test_that("col DataType + col(s) + col regex", { # one Datatype expect_equal( From 7dcc8af135c289bbc0d71754a0ce1ba57accbd30 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 21:34:59 +0200 Subject: [PATCH 27/47] minor --- R/dataframe__frame.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index e204b1c86..ba42a7cae 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1763,12 +1763,12 @@ DataFrame_glimpse = function(..., return_as_string = FALSE) { } # closure to extract col info from a column in - max_num_value = as.integer(min(10, self$height)) + max_num_value = min(10, self$height) max_col_name_trunc = 50 parse_column_ = \(col_name, dtype) { dtype_str = dtype_str_repr(dtype) |> unwrap_or(paste0("??", str_string(dtype))) if (inherits(dtype, "RPolarsDataType")) dtype_str = paste0(" <", dtype_str, ">") - val = self$select(pl$col(col_name)$slice(0L, 
max_num_value))$to_list()[[1]] + val = self$select(pl$col(col_name)$slice(0, max_num_value))$to_list()[[1]] val_str = paste(val, collapse = ", ") if (nchar(col_name) > max_col_name_trunc) { col_name = paste0(substr(col_name, 1, max_col_name_trunc - 3), "...") From b9f0972998b6744de25a91ce8403edae4a243fdf Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 21:43:18 +0200 Subject: [PATCH 28/47] bump rust-version and crate version --- src/rust/Cargo.lock | 2 +- src/rust/Cargo.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index 66a49ee73..d1e550ba2 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -2320,7 +2320,7 @@ dependencies = [ [[package]] name = "r-polars" -version = "0.40.1" +version = "0.41.0" dependencies = [ "either", "extendr-api", diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index 1a1ee34af..30592eb67 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "r-polars" -version = "0.40.1" +version = "0.41.0" edition = "2021" -rust-version = "1.77.0" +rust-version = "1.79.0" publish = false [lib] From 07710567749077e1ba6239ce2efca135b3e35362 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 23 Jun 2024 22:15:19 +0200 Subject: [PATCH 29/47] fix vignettes --- vignettes/polars.Rmd | 6 +- vignettes/userguide.Rmd | 143 ++++++++++++++++++++-------------------- 2 files changed, 73 insertions(+), 76 deletions(-) diff --git a/vignettes/polars.Rmd b/vignettes/polars.Rmd index 2f5d1f3b9..1d4d550d6 100644 --- a/vignettes/polars.Rmd +++ b/vignettes/polars.Rmd @@ -333,7 +333,7 @@ To go from long to wide, we use the `$pivot()` method. Here we pivot the data so that every subject takes its own column. ```{r} -indo_wide = indo$pivot(values = "conc", index = "time", columns = "Subject") +indo_wide = indo$pivot(values = "conc", index = "time", on = "Subject") indo_wide ``` @@ -341,7 +341,7 @@ To go from wide to long, we use the `$melt()` method. ```{r} # indo_wide$melt(id_vars = "time") # default column names are "variable" and "value" -indo_wide$melt(id_vars = "time", variable_name = "subject", value_name = "conc") +indo_wide$unpivot(index = "time", variable_name = "subject", value_name = "conc") ``` Basic functionality aside, it should be noted that `$pivot()` can perform @@ -356,7 +356,7 @@ different combinations of transmission type (`am`) and engine shape (`vs`)? 
dat$pivot( values = "mpg", index = c("am", "vs"), - columns = "cyl", + on = "cyl", aggregate_function = "median" # aggregating function ) ``` diff --git a/vignettes/userguide.Rmd b/vignettes/userguide.Rmd index 0e7857b82..fe988a4d9 100755 --- a/vignettes/userguide.Rmd +++ b/vignettes/userguide.Rmd @@ -271,77 +271,76 @@ dataset$ collect() ``` -````{comment} -```{r} -compute_age = function() 2021 - pl$col("birthday")$dt$year() + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -avg_birthday = function(gender) { - compute_age()$filter(pl$col("gender") == gender)$mean()$alias(sprintf("avg %s birthday", gender)) -} - -q = ( - dataset$lazy()$ - group_by("state")$ - agg( - avg_birthday("M"), - avg_birthday("F"), - (pl$col("gender") == "M")$sum()$alias("# male"), - (pl$col("gender") == "F")$sum()$alias("# female") - )$ - limit(5) -) -q$collect() - -# -# get_person <- function() pl$col("first_name") + pl$lit(" ") + pl$col("last_name") -# q = ( -# dataset$lazy() -# $sort("birthday", descending=True) -# $group_by(["state"]) -# $agg( -# [ -# get_person()$first()$alias("youngest"), -# get_person()$last()$alias("oldest"), -# ] -# ) -# $limit(5) -# ) -# q$collect() -# -# get_person <- function() pl$col("first_name") + pl$lit(" ") + pl$col("last_name") -# q = ( -# dataset$lazy() -# $sort("birthday", descending=True) -# $group_by(["state"]) -# $agg( -# [ -# get_person()$first()$alias("youngest"), -# get_person()$last()$alias("oldest"), -# get_person()$sort()$first()$alias("alphabetical_first"), -# ] -# ) -# $limit(5) -# ) -# q$collect() -# -# q = ( -# dataset$lazy() -# $sort("birthday", descending=True) -# $group_by(["state"]) -# $agg( -# [ -# get_person()$first()$alias("youngest"), -# get_person()$last()$alias("oldest"), -# get_person()$sort()$first()$alias("alphabetical_first"), -# pl$col("gender")$sort_by("first_name")$first()$alias("gender"), -# ] -# ) -# $sort("state") -# $limit(5) -# ) -# q$collect() -``` -```` ## Folds @@ -451,8 +450,6 @@ df$sort("Type 1")$select( # List context and row wise computations ````{comment} - -``` grades = pl$DataFrame( "student" = c("bas", "laura", "tim", "jenny"), "arithmetic" = c(10, 5, 6, 8), @@ -520,7 +517,7 @@ out = df$select( ) print(out) ``` -```` + # R examples From d3c87356273c5af9c1a51663d8b798be6330d14d Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Mon, 24 Jun 2024 21:14:09 +0200 Subject: [PATCH 30/47] bump to 0.41.2 --- src/rust/Cargo.lock | 72 +++++++++++++++++----------------- src/rust/Cargo.toml | 6 +-- src/rust/src/lazy/dataframe.rs | 8 ++-- 3 files changed, 43 insertions(+), 43 deletions(-) diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index 4ad967bb4..c2ec10df6 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -1776,8 +1776,8 @@ dependencies = [ [[package]] name = "polars" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "getrandom", "polars-arrow", @@ -1796,8 +1796,8 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = 
"git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "ahash", "atoi", @@ -1843,8 +1843,8 @@ dependencies = [ [[package]] name = "polars-compute" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "bytemuck", "either", @@ -1858,8 +1858,8 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1892,8 +1892,8 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "avro-schema", "object_store", @@ -1905,8 +1905,8 @@ dependencies = [ [[package]] name = "polars-expr" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1924,8 +1924,8 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "ahash", "async-trait", @@ -1970,8 +1970,8 @@ dependencies = [ [[package]] name = "polars-json" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "ahash", "chrono", @@ -1990,8 +1990,8 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "ahash", "bitflags 2.4.2", @@ -2018,8 +2018,8 @@ dependencies = [ [[package]] name = "polars-mem-engine" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = 
"git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "futures", "polars-arrow", @@ -2038,8 +2038,8 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "ahash", "aho-corasick", @@ -2074,8 +2074,8 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "ahash", "async-stream", @@ -2101,8 +2101,8 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -2128,8 +2128,8 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "ahash", "bytemuck", @@ -2158,8 +2158,8 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "bytemuck", "polars-arrow", @@ -2169,8 +2169,8 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "hex", "once_cell", @@ -2189,8 +2189,8 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "atoi", "bytemuck", @@ -2210,8 +2210,8 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.41.1" -source = "git+https://github.com/pola-rs/polars.git?rev=c0871ef8f8bcbe2108c25137604502f462549b87#c0871ef8f8bcbe2108c25137604502f462549b87" +version = "0.41.2" +source = 
"git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" dependencies = [ "ahash", "bytemuck", diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index 30592eb67..aecc6696e 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -50,8 +50,8 @@ serde_json = "*" smartstring = "1.0.1" state = "0.6.0" thiserror = "1.0.61" -polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "c0871ef8f8bcbe2108c25137604502f462549b87", default-features = false } -polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "c0871ef8f8bcbe2108c25137604502f462549b87", default-features = false } +polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "bf2e201cca2aa2830e276880e0916cf2bbbcd7a8", default-features = false } +polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "bf2e201cca2aa2830e276880e0916cf2bbbcd7a8", default-features = false } either = "1" [dependencies.polars] @@ -159,4 +159,4 @@ features = [ "zip_with", ] git = "https://github.com/pola-rs/polars.git" -rev = "c0871ef8f8bcbe2108c25137604502f462549b87" +rev = "bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index c2feb00dc..642426ada 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -115,7 +115,7 @@ impl RPolarsLazyFrame { }; self.0 .clone() - .sink_parquet(robj_to!(String, path)?.into(), pqwo) + .sink_parquet(robj_to!(String, path)?, pqwo) .map_err(polars_to_rpolars_err) } @@ -126,7 +126,7 @@ impl RPolarsLazyFrame { }; self.0 .clone() - .sink_ipc(robj_to!(String, path)?.into(), ipcwo) + .sink_ipc(robj_to!(String, path)?, ipcwo) .map_err(polars_to_rpolars_err) } @@ -185,7 +185,7 @@ impl RPolarsLazyFrame { self.0 .clone() - .sink_csv(robj_to!(String, path)?.into(), options) + .sink_csv(robj_to!(String, path)?, options) .map_err(polars_to_rpolars_err) } @@ -194,7 +194,7 @@ impl RPolarsLazyFrame { let options = pl::JsonWriterOptions { maintain_order }; self.0 .clone() - .sink_json(robj_to!(String, path)?.into(), options) + .sink_json(robj_to!(String, path)?, options) .map_err(polars_to_rpolars_err) } From 87e95f3b5550410746a29e1d33ffbf619180e0fd Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Mon, 24 Jun 2024 22:23:16 +0200 Subject: [PATCH 31/47] fix vignette --- vignettes/userguide.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/userguide.Rmd b/vignettes/userguide.Rmd index fe988a4d9..1480f5211 100755 --- a/vignettes/userguide.Rmd +++ b/vignettes/userguide.Rmd @@ -407,7 +407,7 @@ df$select( ``` -```{r} +```{comment} filtered = df$ filter(pl$col("Type 2") == "Psychic")$ select(c("Name", "Type 1", "Speed")) From a97daa38383b5bf365f105a80f92e8a5af71b24c Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 26 Jun 2024 15:03:44 +0200 Subject: [PATCH 32/47] try to fix userguide --- vignettes/userguide.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/userguide.Rmd b/vignettes/userguide.Rmd index fe988a4d9..e5d12b646 100755 --- a/vignettes/userguide.Rmd +++ b/vignettes/userguide.Rmd @@ -449,7 +449,7 @@ df$sort("Type 1")$select( # List context and row wise computations -````{comment} +```{comment} grades = pl$DataFrame( "student" = c("bas", "laura", "tim", "jenny"), "arithmetic" = c(10, 5, 6, 8), @@ -477,7 +477,7 @@ grades$with_columns( # Custom functions -``` +```{comment} df = pl$DataFrame( "keys" = 
c("a", "a", "b"), "values" = c(10, 7, 1) From 18216f5bde14f54968b3743fa22de0684100f233 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 26 Jun 2024 15:26:25 +0200 Subject: [PATCH 33/47] news --- NEWS.md | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 091b1ae4b..c5cab2aaa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,10 +2,13 @@ ## Polars R Package (development version) +Updated rust-polars to 0.41.2 (#1147). + ### Breaking changes - In `$n_chunks()`, the default value of `strategy` now is `"first"` (#1137). --`$sample()` for Expr and DataFrame (#1136): + +- `$sample()` for Expr and DataFrame (#1136): - the argument `frac` is renamed `fraction`; - all the arguments except `n` must be named; - for the Expr method only, the first argument is now `n` (it was already the @@ -13,10 +16,55 @@ - for the Expr method only, the default value for `with_replacement` is now `FALSE` (it was already the case for the DataFrame method). +- `$melt()` had several changes (#1147): + - `melt()` is renamed `$unpivot()`. + - Some arguments were renamed: `id_vars` is now `index`, `value_vars` is now + `on`. + - The order of arguments has changed: `on` is now first, then `index`. The + order of the other arguments hasn't changed. Note that `on` can be unnamed + but all the other arguments must be named. + +- `pivot()` had several changes (#1147): + - The argument `columns` is renamed `on`. + - The order of arguments has changed: `on` is now first, then `index` and + `values`. The order of the other arguments hasn't changed. Note that `on` + can be unnamed but all the other arguments must be named. + +- In `$write_parquet()` and `$sink_parquet()`, the default value of argument + `statistics` is now `TRUE` and can take other values than `TRUE/FALSE` (#1147). + +- In `$dt$truncate()` and `$dt$round()`, the argument `offset` has been removed. + Use `$dt$offset_by()` after those functions instead (#1147). + +- In `$top_k()` and `$bottom_k()` for `Expr`, the arguments `nulls_last`, + `maintain_order` and `multithreaded` have been removed. If any `null` values + are in the top/bottom `k` values, they will always be positioned last (#1147). + +- `$replace()` has been split in two functions depending on the desired + behaviour (#1147): + - `$replace()` recodes some values in the column, leaving all other values + unchanged. Compared to the previous version, it doesn't the arguments + `default` and `return_dtype` anymore. + - `$replace_strict()` replaces all values by different values. If a value + doesn't have a specific mapping, it is replaced by the `default` value. + +- `$str$concat()` is deprecated, use `$str$join()` (with the same arguments) + instead (#1147). + +- In `pl$date_range()` and `pl$date_ranges()`, the arguments `time_unit` and + `time_zone` have been removed. They were deprecated in previous versions + (#1147). + + ### New features - New method `$has_nulls()` (#1133). - New method `$list$explode()` (#1139). +- `$over()` gains a new argument `order_by` to specify the order of values + within each group. This is useful when the operation depends on the order of + values, such as `$shift()` (#1147). +- `$value_counts()` gains an argument `normalize` to give relative frequencies + of unique values instead of their count (#1147). 
## Polars R Package 0.17.0 From 4f6752c3be6997ebf378d490a479ed274b7d5a18 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 26 Jun 2024 15:28:59 +0200 Subject: [PATCH 34/47] typo --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index c5cab2aaa..4a33087c1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -43,7 +43,7 @@ Updated rust-polars to 0.41.2 (#1147). - `$replace()` has been split in two functions depending on the desired behaviour (#1147): - `$replace()` recodes some values in the column, leaving all other values - unchanged. Compared to the previous version, it doesn't the arguments + unchanged. Compared to the previous version, it doesn't use the arguments `default` and `return_dtype` anymore. - `$replace_strict()` replaces all values by different values. If a value doesn't have a specific mapping, it is replaced by the `default` value. From 2b03b354fef3dc39f080d11af4a47f2c58f8c6b5 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 26 Jun 2024 15:33:51 +0200 Subject: [PATCH 35/47] whitespace --- NEWS.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4a33087c1..7011d5b18 100644 --- a/NEWS.md +++ b/NEWS.md @@ -23,23 +23,23 @@ Updated rust-polars to 0.41.2 (#1147). - The order of arguments has changed: `on` is now first, then `index`. The order of the other arguments hasn't changed. Note that `on` can be unnamed but all the other arguments must be named. - + - `pivot()` had several changes (#1147): - The argument `columns` is renamed `on`. - - The order of arguments has changed: `on` is now first, then `index` and + - The order of arguments has changed: `on` is now first, then `index` and `values`. The order of the other arguments hasn't changed. Note that `on` can be unnamed but all the other arguments must be named. - + - In `$write_parquet()` and `$sink_parquet()`, the default value of argument `statistics` is now `TRUE` and can take other values than `TRUE/FALSE` (#1147). - + - In `$dt$truncate()` and `$dt$round()`, the argument `offset` has been removed. Use `$dt$offset_by()` after those functions instead (#1147). - + - In `$top_k()` and `$bottom_k()` for `Expr`, the arguments `nulls_last`, `maintain_order` and `multithreaded` have been removed. If any `null` values are in the top/bottom `k` values, they will always be positioned last (#1147). - + - `$replace()` has been split in two functions depending on the desired behaviour (#1147): - `$replace()` recodes some values in the column, leaving all other values @@ -47,10 +47,10 @@ Updated rust-polars to 0.41.2 (#1147). `default` and `return_dtype` anymore. - `$replace_strict()` replaces all values by different values. If a value doesn't have a specific mapping, it is replaced by the `default` value. - + - `$str$concat()` is deprecated, use `$str$join()` (with the same arguments) instead (#1147). - + - In `pl$date_range()` and `pl$date_ranges()`, the arguments `time_unit` and `time_zone` have been removed. They were deprecated in previous versions (#1147). 
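Taken together, the reshaping and window-function entries in the NEWS changes above translate into user code roughly as follows. This is a minimal illustrative sketch, not part of any patch: the toy frames are invented, and the argument usage mirrors the vignette and test snippets earlier in this series (`$pivot(on = ...)`, `$unpivot(index = ...)`, `$over(order_by = ...)`).

```r
library(polars)

# pivot(): the old `columns` argument is now `on`
long = pl$DataFrame(
  time = c(1, 1, 2, 2),
  subject = c("a", "b", "a", "b"),
  conc = c(0.5, 0.7, 0.4, 0.6)
)
wide = long$pivot(values = "conc", index = "time", on = "subject")

# unpivot() replaces melt(); `id_vars` is now `index`
wide$unpivot(index = "time", variable_name = "subject", value_name = "conc")

# $over() gains `order_by`, useful for order-dependent operations such as $shift()
df = pl$DataFrame(
  g = c(1, 1, 2, 2),
  t = c(2, 1, 2, 1),
  x = c(10, 20, 30, 40)
)
df$select(x_lag = pl$col("x")$shift(1)$over("g", order_by = "t"))
```
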
From d5dff366c50e801a3fce9e7efcb98e825a0da748 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 26 Jun 2024 18:20:55 +0200 Subject: [PATCH 36/47] remove blank lines in news --- NEWS.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7011d5b18..40e5f3e65 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,7 +7,6 @@ Updated rust-polars to 0.41.2 (#1147). ### Breaking changes - In `$n_chunks()`, the default value of `strategy` now is `"first"` (#1137). - - `$sample()` for Expr and DataFrame (#1136): - the argument `frac` is renamed `fraction`; - all the arguments except `n` must be named; @@ -15,7 +14,6 @@ Updated rust-polars to 0.41.2 (#1147). case for the DataFrame method); - for the Expr method only, the default value for `with_replacement` is now `FALSE` (it was already the case for the DataFrame method). - - `$melt()` had several changes (#1147): - `melt()` is renamed `$unpivot()`. - Some arguments were renamed: `id_vars` is now `index`, `value_vars` is now @@ -23,23 +21,18 @@ Updated rust-polars to 0.41.2 (#1147). - The order of arguments has changed: `on` is now first, then `index`. The order of the other arguments hasn't changed. Note that `on` can be unnamed but all the other arguments must be named. - - `pivot()` had several changes (#1147): - The argument `columns` is renamed `on`. - The order of arguments has changed: `on` is now first, then `index` and `values`. The order of the other arguments hasn't changed. Note that `on` can be unnamed but all the other arguments must be named. - - In `$write_parquet()` and `$sink_parquet()`, the default value of argument `statistics` is now `TRUE` and can take other values than `TRUE/FALSE` (#1147). - - In `$dt$truncate()` and `$dt$round()`, the argument `offset` has been removed. Use `$dt$offset_by()` after those functions instead (#1147). - - In `$top_k()` and `$bottom_k()` for `Expr`, the arguments `nulls_last`, `maintain_order` and `multithreaded` have been removed. If any `null` values are in the top/bottom `k` values, they will always be positioned last (#1147). - - `$replace()` has been split in two functions depending on the desired behaviour (#1147): - `$replace()` recodes some values in the column, leaving all other values @@ -47,10 +40,8 @@ Updated rust-polars to 0.41.2 (#1147). `default` and `return_dtype` anymore. - `$replace_strict()` replaces all values by different values. If a value doesn't have a specific mapping, it is replaced by the `default` value. - - `$str$concat()` is deprecated, use `$str$join()` (with the same arguments) instead (#1147). - - In `pl$date_range()` and `pl$date_ranges()`, the arguments `time_unit` and `time_zone` have been removed. They were deprecated in previous versions (#1147). 
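The `statistics` argument mentioned in the NEWS entries accepts more than `TRUE`/`FALSE`. A minimal sketch of the accepted forms, mirroring the tests added in the following patch (the `mtcars` frame and temporary file are illustrative only; per `translate_statistics()`, keys omitted from the list form keep their defaults):

```r
library(polars)

dat = pl$DataFrame(mtcars)
tmpf = tempfile(fileext = ".parquet")

# TRUE (the new default) writes the default set of statistics
dat$write_parquet(tmpf, statistics = TRUE)

# "full" enables every statistic, including distinct_count
dat$write_parquet(tmpf, statistics = "full")

# a named list toggles individual statistics; omitted keys keep their defaults
dat$write_parquet(tmpf, statistics = list(min = TRUE, max = TRUE))

unlink(tmpf)
```
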
From 2bf141cf41ddb4b5927946b27bcc140c2f5c280d Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 30 Jun 2024 14:47:49 +0200 Subject: [PATCH 37/47] tests for arg 'statistics' --- R/dataframe__frame.R | 3 ++- R/lazyframe__lazy.R | 5 +++-- R/utils.R | 28 ++++++++++++++++++++++++---- tests/testthat/test-parquet.R | 22 ++++++++++++++++++++++ tests/testthat/test-sink_stream.R | 27 +++++++++++++++++++++++++++ 5 files changed, 78 insertions(+), 7 deletions(-) diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index ba42a7cae..7f7f67c2e 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -2035,7 +2035,8 @@ DataFrame_write_parquet = function( statistics = TRUE, row_group_size = NULL, data_pagesize_limit = NULL) { - statistics = translate_statistics(statistics) + statistics = translate_statistics(statistics)|> + unwrap("in $write_parquet():") .pr$DataFrame$write_parquet( self, file, diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index fef12c014..c0c486368 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -699,7 +699,8 @@ LazyFrame_sink_parquet = function( ) |> unwrap("in $sink_parquet()") } - statistics = translate_statistics(statistics) + statistics = translate_statistics(statistics) |> + unwrap("in $sink_parquet():") lf |> .pr$LazyFrame$sink_parquet( @@ -711,7 +712,7 @@ LazyFrame_sink_parquet = function( data_pagesize_limit, maintain_order ) |> - unwrap("in $sink_parquet()") + unwrap("in $sink_parquet():") invisible(self) } diff --git a/R/utils.R b/R/utils.R index 296ec69cb..f9019a408 100644 --- a/R/utils.R +++ b/R/utils.R @@ -697,13 +697,33 @@ translate_statistics = function(statistics) { null_count = FALSE ) } - } else if (is.character(statistics) && statistics == "full") { - statistics = list( + } else if (is.character(statistics)) { + if (statistics == "full") { + statistics = list( + min = TRUE, + max = TRUE, + distinct_count = TRUE, + null_count = TRUE + ) + } else { + return(Err_plain("`statistics` must be TRUE/FALSE, 'full', or a named list.")) + } + } else if (is.list(statistics)) { + default = list( min = TRUE, max = TRUE, - distinct_count = TRUE, + distinct_count = FALSE, null_count = TRUE ) + statistics = modifyList(default, statistics) + nms = names(statistics) + invalid = nms[! 
nms %in% c("min", "max", "distinct_count", "null_count")] + if (length(invalid) > 0) { + msg = paste0("`", invalid, "`", collapse = ", ") + return( + Err_plain("In `statistics`,", msg, "are not valid keys.") + ) + } } - statistics + result(statistics) } diff --git a/tests/testthat/test-parquet.R b/tests/testthat/test-parquet.R index 4aa8ce658..b74fbf5ed 100644 --- a/tests/testthat/test-parquet.R +++ b/tests/testthat/test-parquet.R @@ -94,3 +94,25 @@ test_that("write_parquet returns the input data", { x = dat$write_parquet(tmpf) expect_identical(x$to_list(), dat$to_list()) }) + +test_that("write_parquet: argument 'statistics'", { + dat = pl$DataFrame(mtcars) + tmpf = tempfile() + on.exit(unlink(tmpf)) + + expect_silent(dat$write_parquet(tmpf, statistics = TRUE)) + expect_silent(dat$write_parquet(tmpf, statistics = FALSE)) + expect_silent(dat$write_parquet(tmpf, statistics = "full")) + expect_grepl_error( + dat$write_parquet(tmpf, statistics = list(null_count = FALSE)), + "File out of specification: null count of a page is required" + ) + expect_grepl_error( + dat$write_parquet(tmpf, statistics = list(foo = TRUE, foo2 = FALSE)), + "In `statistics`, `foo`, `foo2` are not valid keys" + ) + expect_grepl_error( + dat$write_parquet(tmpf, statistics = "foo"), + "`statistics` must be TRUE/FALSE, 'full', or a named list." + ) +}) diff --git a/tests/testthat/test-sink_stream.R b/tests/testthat/test-sink_stream.R index 57010df61..57b470010 100644 --- a/tests/testthat/test-sink_stream.R +++ b/tests/testthat/test-sink_stream.R @@ -13,6 +13,33 @@ test_that("Test sinking data to parquet file", { expect_identical(x$collect()$to_list(), lf$collect()$to_list()) }) +test_that("sink_parquet: argument 'statistics'", { + tmpf = tempfile() + on.exit(unlink(tmpf)) + + expect_silent(lf$sink_parquet(tmpf, statistics = TRUE)) + expect_silent(lf$sink_parquet(tmpf, statistics = FALSE)) + expect_silent(lf$sink_parquet(tmpf, statistics = "full")) + # TODO: uncomment when https://github.com/pola-rs/polars/issues/17306 is fixed + # expect_silent(lf$sink_parquet( + # tmpf, + # statistics = list( + # min = TRUE, + # max = FALSE, + # distinct_count = TRUE, + # null_count = FALSE + # ) + # )) + expect_grepl_error( + lf$sink_parquet(tmpf, statistics = list(foo = TRUE, foo2 = FALSE)), + "In `statistics`, `foo`, `foo2` are not valid keys" + ) + expect_grepl_error( + lf$sink_parquet(tmpf, statistics = "foo"), + "`statistics` must be TRUE/FALSE, 'full', or a named list." 
+ ) +}) + test_that("Test sinking data to IPC file", { tmpf = tempfile() on.exit(unlink(tmpf)) From b4befa8bb3da1d1396dcb5ce03bb2608a56d1f8c Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 30 Jun 2024 14:48:52 +0200 Subject: [PATCH 38/47] uncomment map_batches tests --- tests/testthat/test-dataframe.R | 38 ++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index ef747bd17..46bba75a1 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -331,25 +331,25 @@ test_that("map_batches unity", { }) -# test_that("map_batches type", { -# int_iris = iris -# int_iris[] = lapply(iris, as.integer) -# -# # auto new type allowed if return is R vector -# x = pl$DataFrame(iris)$ -# select( -# pl$col("Sepal.Length")$ -# map_batches(\(s) { -# as.integer(s$to_r()) # ok to return R vector also, will be -# # converted back to series named "" -# })$ -# map_batches(\(s) s * 25L)$ -# map_batches(\(s) s / 4) -# )$ -# to_data_frame()[, 1, drop = FALSE] -# -# expect_identical(x, int_iris[, 1, drop = FALSE] * 25L / 4L) -# }) +test_that("map_batches type", { + int_iris = iris + int_iris[] = lapply(iris, as.integer) + + # auto new type allowed if return is R vector + x = pl$DataFrame(iris)$ + select( + pl$col("Sepal.Length")$ + map_batches(\(s) { + as.integer(s$to_r()) # ok to return R vector also, will be + # converted back to series named "" + })$ + map_batches(\(s) s * 25L)$ + map_batches(\(s) s / 4) + )$ + to_data_frame()[, 1, drop = FALSE] + + expect_identical(x, int_iris[, 1, drop = FALSE] * 25L / 4L) +}) test_that("cloning", { pf = pl$DataFrame(iris) From b8102607c0c9b1dc75d62f0c8b6710acfe7a7a39 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 30 Jun 2024 14:49:41 +0200 Subject: [PATCH 39/47] typo --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 40e5f3e65..76395253a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -33,7 +33,7 @@ Updated rust-polars to 0.41.2 (#1147). - In `$top_k()` and `$bottom_k()` for `Expr`, the arguments `nulls_last`, `maintain_order` and `multithreaded` have been removed. If any `null` values are in the top/bottom `k` values, they will always be positioned last (#1147). -- `$replace()` has been split in two functions depending on the desired +- `$replace()` has been split in two functions depending on the desired behaviour (#1147): - `$replace()` recodes some values in the column, leaving all other values unchanged. 
Compared to the previous version, it doesn't use the arguments From 64753537902d143762db670ad9e3a8aa82a89d00 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 30 Jun 2024 14:51:58 +0200 Subject: [PATCH 40/47] fix incorrect use of date_range in docs --- R/dataframe__frame.R | 2 +- R/group_by_dynamic.R | 2 +- R/utils.R | 4 ++-- man/DynamicGroupBy_class.Rd | 2 +- tests/testthat/test-parquet.R | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 7f7f67c2e..f6cb904e5 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -2035,7 +2035,7 @@ DataFrame_write_parquet = function( statistics = TRUE, row_group_size = NULL, data_pagesize_limit = NULL) { - statistics = translate_statistics(statistics)|> + statistics = translate_statistics(statistics) |> unwrap("in $write_parquet():") .pr$DataFrame$write_parquet( self, diff --git a/R/group_by_dynamic.R b/R/group_by_dynamic.R index 54cd85cef..67a65e399 100644 --- a/R/group_by_dynamic.R +++ b/R/group_by_dynamic.R @@ -5,7 +5,7 @@ #' @aliases RPolarsDynamicGroupBy #' @examples #' df = pl$DataFrame( -#' time = pl$date_range( +#' time = pl$datetime_range( #' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' interval = "30m" diff --git a/R/utils.R b/R/utils.R index f9019a408..d4daaee36 100644 --- a/R/utils.R +++ b/R/utils.R @@ -717,11 +717,11 @@ translate_statistics = function(statistics) { ) statistics = modifyList(default, statistics) nms = names(statistics) - invalid = nms[! nms %in% c("min", "max", "distinct_count", "null_count")] + invalid = nms[!nms %in% c("min", "max", "distinct_count", "null_count")] if (length(invalid) > 0) { msg = paste0("`", invalid, "`", collapse = ", ") return( - Err_plain("In `statistics`,", msg, "are not valid keys.") + Err_plain("In `statistics`,", msg, "are not valid keys.") ) } } diff --git a/man/DynamicGroupBy_class.Rd b/man/DynamicGroupBy_class.Rd index 66fbf6939..0c1f73a69 100644 --- a/man/DynamicGroupBy_class.Rd +++ b/man/DynamicGroupBy_class.Rd @@ -9,7 +9,7 @@ This class comes from \code{\link[=DataFrame_group_by_dynamic]{$group } \examples{ df = pl$DataFrame( - time = pl$date_range( + time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), interval = "30m" diff --git a/tests/testthat/test-parquet.R b/tests/testthat/test-parquet.R index b74fbf5ed..bc7063355 100644 --- a/tests/testthat/test-parquet.R +++ b/tests/testthat/test-parquet.R @@ -104,7 +104,7 @@ test_that("write_parquet: argument 'statistics'", { expect_silent(dat$write_parquet(tmpf, statistics = FALSE)) expect_silent(dat$write_parquet(tmpf, statistics = "full")) expect_grepl_error( - dat$write_parquet(tmpf, statistics = list(null_count = FALSE)), + dat$write_parquet(tmpf, statistics = list(null_count = FALSE)), "File out of specification: null count of a page is required" ) expect_grepl_error( From 2006a5a949efe97c05d3104e6ece8986b1aaf1ab Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 30 Jun 2024 15:35:24 +0200 Subject: [PATCH 41/47] fix docs --- R/expr__datetime.R | 46 ++++++++++++++----------------- R/expr__list.R | 2 +- R/group_by_dynamic.R | 2 +- R/lazyframe__lazy.R | 2 +- R/series__series.R | 2 +- man/DynamicGroupBy_ungroup.Rd | 2 +- man/ExprDT_cast_time_unit.Rd | 2 +- man/ExprDT_convert_time_zone.Rd | 4 +-- 
man/ExprDT_hour.Rd | 2 +- man/ExprDT_minute.Rd | 2 +- man/ExprDT_offset_by.Rd | 2 +- man/ExprDT_replace_time_zone.Rd | 4 +-- man/ExprDT_round.Rd | 6 ++-- man/ExprDT_time.Rd | 2 +- man/ExprDT_timestamp.Rd | 2 +- man/ExprDT_total_days.Rd | 4 +-- man/ExprDT_total_microseconds.Rd | 2 +- man/ExprDT_total_milliseconds.Rd | 2 +- man/ExprDT_total_nanoseconds.Rd | 2 +- man/ExprDT_total_seconds.Rd | 2 +- man/ExprDT_truncate.Rd | 6 ++-- man/ExprDT_with_time_unit.Rd | 2 +- man/ExprList_eval.Rd | 2 +- man/LazyFrame_group_by_dynamic.Rd | 2 +- man/Series_class.Rd | 2 +- 25 files changed, 50 insertions(+), 58 deletions(-) diff --git a/R/expr__datetime.R b/R/expr__datetime.R index 5bbbc733d..6d429c141 100644 --- a/R/expr__datetime.R +++ b/R/expr__datetime.R @@ -23,12 +23,10 @@ #' @examples #' t1 = as.POSIXct("3040-01-01", tz = "GMT") #' t2 = t1 + as.difftime(25, units = "secs") -#' s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") +#' s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") #' -#' # use a dt namespace function #' df = pl$DataFrame(datetime = s)$with_columns( -#' pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s"), -#' pl$col("datetime")$dt$truncate("4s", offset("3s"))$alias("truncated_4s_offset_2s") +#' pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s") #' ) #' df ExprDT_truncate = function(every) { @@ -49,12 +47,10 @@ ExprDT_truncate = function(every) { #' @examples #' t1 = as.POSIXct("3040-01-01", tz = "GMT") #' t2 = t1 + as.difftime(25, units = "secs") -#' s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") +#' s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") #' -#' # use a dt namespace function #' df = pl$DataFrame(datetime = s)$with_columns( -#' pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s"), -#' pl$col("datetime")$dt$truncate("4s", offset("3s"))$alias("truncated_4s_offset_2s") +#' pl$col("datetime")$dt$round("4s")$alias("rounded_4s") #' ) #' df ExprDT_round = function(every) { @@ -344,7 +340,7 @@ ExprDT_ordinal_day = function() { #' @aliases (Expr)$dt$hour #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d2h", @@ -369,7 +365,7 @@ ExprDT_hour = function() { #' @aliases (Expr)$dt$minute #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d5s", @@ -530,7 +526,7 @@ ExprDT_epoch = function(tu = c("us", "ns", "ms", "s", "d")) { #' @aliases (Expr)$dt$timestamp #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' start = as.Date("2001-1-1"), #' end = as.Date("2001-1-3"), #' interval = "1d1s" @@ -559,7 +555,7 @@ ExprDT_timestamp = function(tu = c("ns", "us", "ms")) { #' @aliases (Expr)$dt$with_time_unit #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' start = as.Date("2001-1-1"), #' end = as.Date("2001-1-3"), #' interval = "1d1s" @@ -589,7 +585,7 @@ ExprDT_with_time_unit = function(tu = c("ns", "us", "ms")) { #' @aliases (Expr)$dt$cast_time_unit #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' start = as.Date("2001-1-1"), #' end = as.Date("2001-1-3"), #' interval = "1d1s" @@ -615,10 +611,10 @@ ExprDT_cast_time_unit = function(tu = c("ns", "us", "ms")) { #' @return Expr of i64 #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' 
as.POSIXct("2020-03-01", tz = "UTC"), #' as.POSIXct("2020-05-01", tz = "UTC"), -#' "1mo" +#' "1mo1s" #' ) #' ) #' @@ -655,10 +651,10 @@ ExprDT_convert_time_zone = function(time_zone) { #' @aliases (Expr)$dt$replace_time_zone #' @examples #' df1 = pl$DataFrame( -#' london_timezone = pl$date_range( +#' london_timezone = pl$datetime_range( #' as.POSIXct("2020-03-01", tz = "UTC"), #' as.POSIXct("2020-07-01", tz = "UTC"), -#' "1mo" +#' "1mo1s" #' )$dt$convert_time_zone("Europe/London") #' ) #' @@ -703,10 +699,10 @@ ExprDT_replace_time_zone = function( #' @return Expr of i64 #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' start = as.Date("2020-3-1"), #' end = as.Date("2020-5-1"), -#' interval = "1mo" +#' interval = "1mo1s" #' ) #' ) #' df$select( @@ -765,7 +761,7 @@ ExprDT_total_minutes = function() { #' #' @return Expr of i64 #' @examples -#' df = pl$DataFrame(date = pl$date_range( +#' df = pl$DataFrame(date = pl$datetime_range( #' start = as.POSIXct("2020-1-1", tz = "GMT"), #' end = as.POSIXct("2020-1-1 00:04:00", tz = "GMT"), #' interval = "1m" @@ -784,7 +780,7 @@ ExprDT_total_seconds = function() { #' #' @return Expr of i64 #' @examples -#' df = pl$DataFrame(date = pl$date_range( +#' df = pl$DataFrame(date = pl$datetime_range( #' start = as.POSIXct("2020-1-1", tz = "GMT"), #' end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), #' interval = "1ms" @@ -803,7 +799,7 @@ ExprDT_total_milliseconds = function() { #' #' @return Expr of i64 #' @examples -#' df = pl$DataFrame(date = pl$date_range( +#' df = pl$DataFrame(date = pl$datetime_range( #' start = as.POSIXct("2020-1-1", tz = "GMT"), #' end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), #' interval = "1ms" @@ -822,7 +818,7 @@ ExprDT_total_microseconds = function() { #' #' @return Expr of i64 #' @examples -#' df = pl$DataFrame(date = pl$date_range( +#' df = pl$DataFrame(date = pl$datetime_range( #' start = as.POSIXct("2020-1-1", tz = "GMT"), #' end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), #' interval = "1ms" @@ -881,7 +877,7 @@ ExprDT_total_nanoseconds = function() { #' #' # the "by" argument also accepts expressions #' df = pl$DataFrame( -#' dates = pl$date_range( +#' dates = pl$datetime_range( #' as.POSIXct("2022-01-01", tz = "GMT"), #' as.POSIXct("2022-01-02", tz = "GMT"), #' interval = "6h", time_unit = "ms", time_zone = "GMT" @@ -906,7 +902,7 @@ ExprDT_offset_by = function(by) { #' #' #' @examples -#' df = pl$DataFrame(dates = pl$date_range( +#' df = pl$DataFrame(dates = pl$datetime_range( #' as.Date("2000-1-1"), #' as.Date("2000-1-2"), #' "1h" diff --git a/R/expr__list.R b/R/expr__list.R index b37f081da..c02d7e438 100644 --- a/R/expr__list.R +++ b/R/expr__list.R @@ -491,7 +491,7 @@ ExprList_to_struct = function( #' #' # concat strings in each list #' df$select( -#' pl$col("b")$list$eval(pl$element()$str$concat(" "))$list$first() +#' pl$col("b")$list$eval(pl$element()$str$join(" "))$list$first() #' ) ExprList_eval = function(expr, parallel = FALSE) { .pr$Expr$list_eval(self, expr, parallel) diff --git a/R/group_by_dynamic.R b/R/group_by_dynamic.R index 67a65e399..3a02e957a 100644 --- a/R/group_by_dynamic.R +++ b/R/group_by_dynamic.R @@ -109,7 +109,7 @@ DynamicGroupBy_agg = function(...) 
{ #' @return [DataFrame][DataFrame_class] #' @examples #' df = pl$DataFrame( -#' time = pl$date_range( +#' time = pl$datetime_range( #' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' interval = "30m" diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index c0c486368..94d1ede21 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1999,7 +1999,7 @@ LazyFrame_rolling = function( #' - [`$rolling()`][LazyFrame_rolling] #' @examples #' lf = pl$LazyFrame( -#' time = pl$date_range( +#' time = pl$datetime_range( #' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' interval = "30m" diff --git a/R/series__series.R b/R/series__series.R index 6e8607357..20ad15e6d 100644 --- a/R/series__series.R +++ b/R/series__series.R @@ -110,7 +110,7 @@ #' #' # use Expr method in subnamespaces #' as_polars_series(list(3:1, 1:2, NULL))$list$first() -#' as_polars_series(c(1, NA, 2))$str$concat("-") +#' as_polars_series(c(1, NA, 2))$str$join("-") #' #' s = pl$date_range( #' as.Date("2024-02-18"), as.Date("2024-02-24"), diff --git a/man/DynamicGroupBy_ungroup.Rd b/man/DynamicGroupBy_ungroup.Rd index 3e042c876..f7c9663ca 100644 --- a/man/DynamicGroupBy_ungroup.Rd +++ b/man/DynamicGroupBy_ungroup.Rd @@ -15,7 +15,7 @@ Revert the \verb{$group_by_dynamic()} operation. Doing } \examples{ df = pl$DataFrame( - time = pl$date_range( + time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), interval = "30m" diff --git a/man/ExprDT_cast_time_unit.Rd b/man/ExprDT_cast_time_unit.Rd index 8532d0a9e..d423401d9 100644 --- a/man/ExprDT_cast_time_unit.Rd +++ b/man/ExprDT_cast_time_unit.Rd @@ -19,7 +19,7 @@ The corresponding global timepoint will stay unchanged +/- precision. } \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d1s" diff --git a/man/ExprDT_convert_time_zone.Rd b/man/ExprDT_convert_time_zone.Rd index 3914e8690..7b914b3e0 100644 --- a/man/ExprDT_convert_time_zone.Rd +++ b/man/ExprDT_convert_time_zone.Rd @@ -19,10 +19,10 @@ regardless of your system’s time zone. } \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( as.POSIXct("2020-03-01", tz = "UTC"), as.POSIXct("2020-05-01", tz = "UTC"), - "1mo" + "1mo1s" ) ) diff --git a/man/ExprDT_hour.Rd b/man/ExprDT_hour.Rd index f16784847..2c94e1232 100644 --- a/man/ExprDT_hour.Rd +++ b/man/ExprDT_hour.Rd @@ -17,7 +17,7 @@ Returns the hour number from 0 to 23. } \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d2h", diff --git a/man/ExprDT_minute.Rd b/man/ExprDT_minute.Rd index 0d6ef7da8..e5a903c5e 100644 --- a/man/ExprDT_minute.Rd +++ b/man/ExprDT_minute.Rd @@ -17,7 +17,7 @@ Returns the minute number from 0 to 59. 
} \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d5s", diff --git a/man/ExprDT_offset_by.Rd b/man/ExprDT_offset_by.Rd index a5e44a488..844f0db76 100644 --- a/man/ExprDT_offset_by.Rd +++ b/man/ExprDT_offset_by.Rd @@ -55,7 +55,7 @@ df$select( # the "by" argument also accepts expressions df = pl$DataFrame( - dates = pl$date_range( + dates = pl$datetime_range( as.POSIXct("2022-01-01", tz = "GMT"), as.POSIXct("2022-01-02", tz = "GMT"), interval = "6h", time_unit = "ms", time_zone = "GMT" diff --git a/man/ExprDT_replace_time_zone.Rd b/man/ExprDT_replace_time_zone.Rd index 449ac3fd4..6d0377c04 100644 --- a/man/ExprDT_replace_time_zone.Rd +++ b/man/ExprDT_replace_time_zone.Rd @@ -42,10 +42,10 @@ change the corresponding global timepoint. } \examples{ df1 = pl$DataFrame( - london_timezone = pl$date_range( + london_timezone = pl$datetime_range( as.POSIXct("2020-03-01", tz = "UTC"), as.POSIXct("2020-07-01", tz = "UTC"), - "1mo" + "1mo1s" )$dt$convert_time_zone("Europe/London") ) diff --git a/man/ExprDT_round.Rd b/man/ExprDT_round.Rd index be51b35af..601e133d3 100644 --- a/man/ExprDT_round.Rd +++ b/man/ExprDT_round.Rd @@ -43,12 +43,10 @@ These strings can be combined: \examples{ t1 = as.POSIXct("3040-01-01", tz = "GMT") t2 = t1 + as.difftime(25, units = "secs") -s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") +s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") -# use a dt namespace function df = pl$DataFrame(datetime = s)$with_columns( - pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s"), - pl$col("datetime")$dt$truncate("4s", offset("3s"))$alias("truncated_4s_offset_2s") + pl$col("datetime")$dt$round("4s")$alias("rounded_4s") ) df } diff --git a/man/ExprDT_time.Rd b/man/ExprDT_time.Rd index 6aebb4ceb..8604e6a2a 100644 --- a/man/ExprDT_time.Rd +++ b/man/ExprDT_time.Rd @@ -13,7 +13,7 @@ A Time Expr This only works on Datetime Series, it will error on Date Series. } \examples{ -df = pl$DataFrame(dates = pl$date_range( +df = pl$DataFrame(dates = pl$datetime_range( as.Date("2000-1-1"), as.Date("2000-1-2"), "1h" diff --git a/man/ExprDT_timestamp.Rd b/man/ExprDT_timestamp.Rd index 6892e5aa7..74ddad68b 100644 --- a/man/ExprDT_timestamp.Rd +++ b/man/ExprDT_timestamp.Rd @@ -18,7 +18,7 @@ Return a timestamp in the given time unit. } \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d1s" diff --git a/man/ExprDT_total_days.Rd b/man/ExprDT_total_days.Rd index b7dc388a0..f896d40ce 100644 --- a/man/ExprDT_total_days.Rd +++ b/man/ExprDT_total_days.Rd @@ -14,10 +14,10 @@ Extract the days from a Duration type. } \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( start = as.Date("2020-3-1"), end = as.Date("2020-5-1"), - interval = "1mo" + interval = "1mo1s" ) ) df$select( diff --git a/man/ExprDT_total_microseconds.Rd b/man/ExprDT_total_microseconds.Rd index 0e69df198..2338c85d6 100644 --- a/man/ExprDT_total_microseconds.Rd +++ b/man/ExprDT_total_microseconds.Rd @@ -13,7 +13,7 @@ Expr of i64 Extract the microseconds from a Duration type. 
} \examples{ -df = pl$DataFrame(date = pl$date_range( +df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), interval = "1ms" diff --git a/man/ExprDT_total_milliseconds.Rd b/man/ExprDT_total_milliseconds.Rd index 9cf2b3f54..3919edd50 100644 --- a/man/ExprDT_total_milliseconds.Rd +++ b/man/ExprDT_total_milliseconds.Rd @@ -13,7 +13,7 @@ Expr of i64 Extract the milliseconds from a Duration type. } \examples{ -df = pl$DataFrame(date = pl$date_range( +df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), interval = "1ms" diff --git a/man/ExprDT_total_nanoseconds.Rd b/man/ExprDT_total_nanoseconds.Rd index dda7bfc41..e0489ece7 100644 --- a/man/ExprDT_total_nanoseconds.Rd +++ b/man/ExprDT_total_nanoseconds.Rd @@ -13,7 +13,7 @@ Expr of i64 Extract the nanoseconds from a Duration type. } \examples{ -df = pl$DataFrame(date = pl$date_range( +df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), interval = "1ms" diff --git a/man/ExprDT_total_seconds.Rd b/man/ExprDT_total_seconds.Rd index 9bdabdff5..31b33c68b 100644 --- a/man/ExprDT_total_seconds.Rd +++ b/man/ExprDT_total_seconds.Rd @@ -13,7 +13,7 @@ Expr of i64 Extract the seconds from a Duration type. } \examples{ -df = pl$DataFrame(date = pl$date_range( +df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:04:00", tz = "GMT"), interval = "1m" diff --git a/man/ExprDT_truncate.Rd b/man/ExprDT_truncate.Rd index e177bd962..a8a0140a8 100644 --- a/man/ExprDT_truncate.Rd +++ b/man/ExprDT_truncate.Rd @@ -40,12 +40,10 @@ These strings can be combined: \examples{ t1 = as.POSIXct("3040-01-01", tz = "GMT") t2 = t1 + as.difftime(25, units = "secs") -s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") +s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") -# use a dt namespace function df = pl$DataFrame(datetime = s)$with_columns( - pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s"), - pl$col("datetime")$dt$truncate("4s", offset("3s"))$alias("truncated_4s_offset_2s") + pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s") ) df } diff --git a/man/ExprDT_with_time_unit.Rd b/man/ExprDT_with_time_unit.Rd index 65d85a261..dc6c5e9e7 100644 --- a/man/ExprDT_with_time_unit.Rd +++ b/man/ExprDT_with_time_unit.Rd @@ -20,7 +20,7 @@ The corresponding global timepoint will change. 
} \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d1s" diff --git a/man/ExprList_eval.Rd b/man/ExprList_eval.Rd index 1161c5f86..6f04b96d5 100644 --- a/man/ExprList_eval.Rd +++ b/man/ExprList_eval.Rd @@ -47,6 +47,6 @@ df$select( # concat strings in each list df$select( - pl$col("b")$list$eval(pl$element()$str$concat(" "))$list$first() + pl$col("b")$list$eval(pl$element()$str$join(" "))$list$first() ) } diff --git a/man/LazyFrame_group_by_dynamic.Rd b/man/LazyFrame_group_by_dynamic.Rd index f0177d440..1fe869e29 100644 --- a/man/LazyFrame_group_by_dynamic.Rd +++ b/man/LazyFrame_group_by_dynamic.Rd @@ -95,7 +95,7 @@ by: } \examples{ lf = pl$LazyFrame( - time = pl$date_range( + time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), interval = "30m" diff --git a/man/Series_class.Rd b/man/Series_class.Rd index 105c8e03a..8907e8cb6 100644 --- a/man/Series_class.Rd +++ b/man/Series_class.Rd @@ -172,7 +172,7 @@ s$cos() # use Expr method in subnamespaces as_polars_series(list(3:1, 1:2, NULL))$list$first() -as_polars_series(c(1, NA, 2))$str$concat("-") +as_polars_series(c(1, NA, 2))$str$join("-") s = pl$date_range( as.Date("2024-02-18"), as.Date("2024-02-24"), From f74f3c3ccddaec1fa53237d1322b474600cad8a3 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 30 Jun 2024 15:49:44 +0200 Subject: [PATCH 42/47] add utils:: --- R/utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index d4daaee36..060e47108 100644 --- a/R/utils.R +++ b/R/utils.R @@ -715,7 +715,7 @@ translate_statistics = function(statistics) { distinct_count = FALSE, null_count = TRUE ) - statistics = modifyList(default, statistics) + statistics = utils::modifyList(default, statistics) nms = names(statistics) invalid = nms[!nms %in% c("min", "max", "distinct_count", "null_count")] if (length(invalid) > 0) { From 403443bdfb057cf53733f3419a4d78808d90023e Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Mon, 1 Jul 2024 13:24:46 +0200 Subject: [PATCH 43/47] check length of statistics --- R/utils.R | 3 +++ tests/testthat/test-parquet.R | 4 ++++ tests/testthat/test-sink_stream.R | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/R/utils.R b/R/utils.R index 060e47108..fe70bff6e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -681,6 +681,9 @@ is_named = function(x) { # Used in parquet write/sink translate_statistics = function(statistics) { + if (length(statistics) != 1) { + return(Err_plain("`statistics` must be of length 1.")) + } if (is.logical(statistics)) { if (isTRUE(statistics)) { statistics = list( diff --git a/tests/testthat/test-parquet.R b/tests/testthat/test-parquet.R index bc7063355..26b2869ff 100644 --- a/tests/testthat/test-parquet.R +++ b/tests/testthat/test-parquet.R @@ -115,4 +115,8 @@ test_that("write_parquet: argument 'statistics'", { dat$write_parquet(tmpf, statistics = "foo"), "`statistics` must be TRUE/FALSE, 'full', or a named list." ) + expect_grepl_error( + dat$write_parquet(tmpf, statistics = c(max = TRUE, min = FALSE)), + "`statistics` must be of length 1." 
+ ) }) diff --git a/tests/testthat/test-sink_stream.R b/tests/testthat/test-sink_stream.R index 57b470010..5d676c6df 100644 --- a/tests/testthat/test-sink_stream.R +++ b/tests/testthat/test-sink_stream.R @@ -38,6 +38,10 @@ test_that("sink_parquet: argument 'statistics'", { lf$sink_parquet(tmpf, statistics = "foo"), "`statistics` must be TRUE/FALSE, 'full', or a named list." ) + expect_grepl_error( + lf$sink_parquet(tmpf, statistics = c(max = TRUE, min = FALSE)), + "`statistics` must be of length 1." + ) }) test_that("Test sinking data to IPC file", { From 4ad895093f0bffa2366f89f991e1404845d2d7ed Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Mon, 1 Jul 2024 13:26:44 +0200 Subject: [PATCH 44/47] use ".r" instead of "comment" as chunk engine --- vignettes/userguide.Rmd | 154 ++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 77 deletions(-) diff --git a/vignettes/userguide.Rmd b/vignettes/userguide.Rmd index 3537305ae..e2663d4b8 100755 --- a/vignettes/userguide.Rmd +++ b/vignettes/userguide.Rmd @@ -16,7 +16,7 @@ options(rmarkdown.html_vignette.check_title = FALSE) ``` -````{comment} +```{.r} These functions/methods are either missing, broken, or Vincent can't figure out how to use them. * `Series_shift` @@ -33,7 +33,7 @@ Requires new Polars version: * `df$sample()` * `df$describe()` -```` +``` [The Polars User Guide](https://pola-rs.github.io/polars-book/user-guide/) is a detailed tutorial about the Polars DataFrame library. Its goal is to introduce you to Polars by going through examples and comparing it to other solutions. Some design choices are introduced there. The guide also introduces you to optimal usage of Polars. The Polars User Guide is available at this link: @@ -271,82 +271,82 @@ dataset$ collect() ``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +```{.r} +compute_age = function() 2021 - pl$col("birthday")$dt$year() + +avg_birthday = function(gender) { + compute_age()$filter(pl$col("gender") == gender)$mean()$alias(sprintf("avg %s birthday", gender)) +} + +q = ( + dataset$lazy()$ + group_by("state")$ + agg( + avg_birthday("M"), + avg_birthday("F"), + (pl$col("gender") == "M")$sum()$alias("# male"), + (pl$col("gender") == "F")$sum()$alias("# female") + )$ + limit(5) +) +q$collect() + +# +# get_person <- function() pl$col("first_name") + pl$lit(" ") + pl$col("last_name") +# q = ( +# dataset$lazy() +# $sort("birthday", descending=True) +# $group_by(["state"]) +# $agg( +# [ +# get_person()$first()$alias("youngest"), +# get_person()$last()$alias("oldest"), +# ] +# ) +# $limit(5) +# ) +# q$collect() +# +# get_person <- function() pl$col("first_name") + pl$lit(" ") + pl$col("last_name") +# q = ( +# dataset$lazy() +# $sort("birthday", descending=True) +# $group_by(["state"]) +# $agg( +# [ +# get_person()$first()$alias("youngest"), +# get_person()$last()$alias("oldest"), +# get_person()$sort()$first()$alias("alphabetical_first"), +# ] +# ) +# $limit(5) +# ) +# q$collect() +# +# q = ( +# dataset$lazy() +# $sort("birthday", descending=True) +# $group_by(["state"]) +# $agg( +# [ +# get_person()$first()$alias("youngest"), +# get_person()$last()$alias("oldest"), +# get_person()$sort()$first()$alias("alphabetical_first"), +# pl$col("gender")$sort_by("first_name")$first()$alias("gender"), +# ] +# ) +# $sort("state") +# $limit(5) +# ) +# q$collect() +``` ## Folds -```{comment} +```{.r} df = pl$DataFrame( "a" = c(1, 2, 3), "b" 
= c(10, 20, 30) @@ -396,7 +396,7 @@ df = pl$read_csv( ) ``` -```{comment} +```{.r} df$select( "Type 1", "Type 2", @@ -407,7 +407,7 @@ df$select( ``` -```{comment} +```{.r} filtered = df$ filter(pl$col("Type 2") == "Psychic")$ select(c("Name", "Type 1", "Speed")) @@ -449,7 +449,7 @@ df$sort("Type 1")$select( # List context and row wise computations -```{comment} +```{.r} grades = pl$DataFrame( "student" = c("bas", "laura", "tim", "jenny"), "arithmetic" = c(10, 5, 6, 8), @@ -477,7 +477,7 @@ grades$with_columns( # Custom functions -```{comment} +```{.r} df = pl$DataFrame( "keys" = c("a", "a", "b"), "values" = c(10, 7, 1) @@ -621,7 +621,7 @@ df$group_by("fruits")$ ``` -```{comment} +```{.r} # We can explode the list column "cars" to a new row for each element in the list df$ # sort("cars")$ From 6166d273bbd05b826bee4c886637273742bdd204 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Mon, 1 Jul 2024 23:11:35 +0200 Subject: [PATCH 45/47] bump to unreleased version --- R/utils.R | 2 +- src/rust/Cargo.lock | 44 +++++++++++-------------- src/rust/Cargo.toml | 6 ++-- src/rust/src/lazy/dataframe.rs | 1 + src/rust/src/rdataframe/read_parquet.rs | 1 + 5 files changed, 25 insertions(+), 29 deletions(-) diff --git a/R/utils.R b/R/utils.R index fe70bff6e..46141eef7 100644 --- a/R/utils.R +++ b/R/utils.R @@ -681,7 +681,7 @@ is_named = function(x) { # Used in parquet write/sink translate_statistics = function(statistics) { - if (length(statistics) != 1) { + if (length(statistics) != 1 && !is.list(statistics)) { return(Err_plain("`statistics` must be of length 1.")) } if (is.logical(statistics)) { diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index 9669218a1..5d8db30e8 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -1777,7 +1777,7 @@ dependencies = [ [[package]] name = "polars" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "getrandom", "polars-arrow", @@ -1797,7 +1797,7 @@ dependencies = [ [[package]] name = "polars-arrow" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "atoi", @@ -1844,7 +1844,7 @@ dependencies = [ [[package]] name = "polars-compute" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "bytemuck", "either", @@ -1859,7 +1859,7 @@ dependencies = [ [[package]] name = "polars-core" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1893,7 +1893,7 @@ dependencies = [ [[package]] name = "polars-error" version = "0.41.2" -source = 
"git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "avro-schema", "object_store", @@ -1906,7 +1906,7 @@ dependencies = [ [[package]] name = "polars-expr" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1925,7 +1925,7 @@ dependencies = [ [[package]] name = "polars-io" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "async-trait", @@ -1971,10 +1971,11 @@ dependencies = [ [[package]] name = "polars-json" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "chrono", + "chrono-tz", "fallible-streaming-iterator", "hashbrown 0.14.3", "indexmap", @@ -1991,7 +1992,7 @@ dependencies = [ [[package]] name = "polars-lazy" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "bitflags 2.4.2", @@ -2019,7 +2020,7 @@ dependencies = [ [[package]] name = "polars-mem-engine" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "futures", "polars-arrow", @@ -2039,7 +2040,7 @@ dependencies = [ [[package]] name = "polars-ops" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "aho-corasick", @@ -2075,7 +2076,7 @@ dependencies = [ [[package]] name = "polars-parquet" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "async-stream", @@ -2091,7 +2092,6 @@ dependencies = [ "polars-compute", "polars-error", "polars-utils", - "seq-macro", "serde", "simdutf8", "snap", @@ -2102,7 +2102,7 @@ dependencies = [ [[package]] name = "polars-pipe" version = "0.41.2" -source = 
"git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -2129,7 +2129,7 @@ dependencies = [ [[package]] name = "polars-plan" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "bytemuck", @@ -2159,7 +2159,7 @@ dependencies = [ [[package]] name = "polars-row" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "bytemuck", "polars-arrow", @@ -2170,7 +2170,7 @@ dependencies = [ [[package]] name = "polars-sql" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "hex", "once_cell", @@ -2190,7 +2190,7 @@ dependencies = [ [[package]] name = "polars-time" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "atoi", "bytemuck", @@ -2211,7 +2211,7 @@ dependencies = [ [[package]] name = "polars-utils" version = "0.41.2" -source = "git+https://github.com/pola-rs/polars.git?rev=bf2e201cca2aa2830e276880e0916cf2bbbcd7a8#bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "bytemuck", @@ -2755,12 +2755,6 @@ dependencies = [ "libc", ] -[[package]] -name = "seq-macro" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" - [[package]] name = "serde" version = "1.0.203" diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index aecc6696e..9bfabf136 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -50,8 +50,8 @@ serde_json = "*" smartstring = "1.0.1" state = "0.6.0" thiserror = "1.0.61" -polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "bf2e201cca2aa2830e276880e0916cf2bbbcd7a8", default-features = false } -polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "bf2e201cca2aa2830e276880e0916cf2bbbcd7a8", default-features = false } +polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "f73937ab5213a44eaaba8cfc799d8f837600f179", default-features = false } +polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "f73937ab5213a44eaaba8cfc799d8f837600f179", default-features = false } either = "1" [dependencies.polars] @@ -159,4 +159,4 @@ features = [ "zip_with", ] 
git = "https://github.com/pola-rs/polars.git" -rev = "bf2e201cca2aa2830e276880e0916cf2bbbcd7a8" +rev = "f73937ab5213a44eaaba8cfc799d8f837600f179" diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index 642426ada..41f21363e 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -167,6 +167,7 @@ impl RPolarsLazyFrame { date_format, time_format, datetime_format, + float_scientific: None, float_precision, separator, quote_char: quote, diff --git a/src/rust/src/rdataframe/read_parquet.rs b/src/rust/src/rdataframe/read_parquet.rs index 339f3b5b2..af7bfc662 100644 --- a/src/rust/src/rdataframe/read_parquet.rs +++ b/src/rust/src/rdataframe/read_parquet.rs @@ -45,6 +45,7 @@ pub fn new_from_parquet( enabled: robj_to!(Option, bool, hive_partitioning)?, hive_start_idx: 0, // TODO: is it actually 0? schema: None, // TODO: implement a option to set this + try_parse_dates: true, }, glob: robj_to!(bool, glob)?, }; From 74a40b5b81035a6f8eb4a74db4e7622ff107fddf Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Mon, 1 Jul 2024 23:21:38 +0200 Subject: [PATCH 46/47] fix for cross joins --- NEWS.md | 2 ++ R/lazyframe__lazy.R | 26 +++++++++++++++++--------- tests/testthat/test-joins.R | 10 ++++++++++ 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index 76395253a..203b4041a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -45,6 +45,8 @@ Updated rust-polars to 0.41.2 (#1147). - In `pl$date_range()` and `pl$date_ranges()`, the arguments `time_unit` and `time_zone` have been removed. They were deprecated in previous versions (#1147). +- In `$join()`, when `how = "cross"`, `on`, `left_on` and `right_on` must be + `NULL` (#1147). ### New features diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 94d1ede21..90d58295b 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1336,16 +1336,24 @@ LazyFrame_join = function( Err_plain("`other` must be a LazyFrame.") |> uw() } - if (!is.null(on)) { - rexprs_right = rexprs_left = as.list(on) - } else if ((!is.null(left_on) && !is.null(right_on))) { - rexprs_left = as.list(left_on) - rexprs_right = as.list(right_on) - } else if (how != "cross") { - Err_plain("must specify either `on`, or `left_on` and `right_on`.") |> uw() + if (how == "cross") { + if (!is.null(on) || !is.null(left_on) || !is.null(right_on)) { + Err_plain("cross join should not pass join keys.") |> uw() + } + rexprs_left = as.list(NULL) + rexprs_right = as.list(NULL) } else { - rexprs_left = as.list(self$columns) - rexprs_right = as.list(other$columns) + if (!is.null(on)) { + rexprs_right = rexprs_left = as.list(on) + } else if ((!is.null(left_on) && !is.null(right_on))) { + rexprs_left = as.list(left_on) + rexprs_right = as.list(right_on) + } else if (how != "cross") { + Err_plain("must specify either `on`, or `left_on` and `right_on`.") |> uw() + } else { + rexprs_left = as.list(self$columns) + rexprs_right = as.list(other$columns) + } } .pr$LazyFrame$join( diff --git a/tests/testthat/test-joins.R b/tests/testthat/test-joins.R index 8e3953468..8e2e1c5d3 100644 --- a/tests/testthat/test-joins.R +++ b/tests/testthat/test-joins.R @@ -111,6 +111,16 @@ test_that("cross join, DataFrame", { ) ) + expect_grepl_error( + dat$join(dat2, how = "cross", on = "foo"), + "cross join should not pass join keys" + ) + + expect_grepl_error( + dat$join(dat2, how = "cross", left_on = "foo", right_on = "foo2"), + "cross join should not pass join keys" + ) + # one empty dataframe dat_empty = pl$DataFrame(y = character()) 
expect_identical( From 99d330f4baf51a5a910ee47787a9fba3c86837fb Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Mon, 1 Jul 2024 23:27:02 +0200 Subject: [PATCH 47/47] other fixes for joins --- R/dataframe__frame.R | 2 +- R/lazyframe__lazy.R | 7 ++----- man/DataFrame_join.Rd | 2 +- man/LazyFrame_join.Rd | 2 +- tests/testthat/test-joins.R | 12 ++++++------ 5 files changed, 11 insertions(+), 14 deletions(-) diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index f6cb904e5..57b7c71a9 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1070,7 +1070,7 @@ DataFrame_to_list = function(unnest_structs = TRUE, ..., int64_conversion = pola DataFrame_join = function( other, on = NULL, - how = c("inner", "left", "full", "semi", "anti", "cross"), + how = "inner", ..., left_on = NULL, right_on = NULL, diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 90d58295b..39fe36eca 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1320,7 +1320,7 @@ LazyFrame_group_by = function(..., maintain_order = polars_options()$maintain_or LazyFrame_join = function( other, on = NULL, - how = c("inner", "left", "full", "semi", "anti", "cross"), + how = "inner", ..., left_on = NULL, right_on = NULL, @@ -1348,11 +1348,8 @@ LazyFrame_join = function( } else if ((!is.null(left_on) && !is.null(right_on))) { rexprs_left = as.list(left_on) rexprs_right = as.list(right_on) - } else if (how != "cross") { - Err_plain("must specify either `on`, or `left_on` and `right_on`.") |> uw() } else { - rexprs_left = as.list(self$columns) - rexprs_right = as.list(other$columns) + Err_plain("must specify either `on`, or `left_on` and `right_on`.") |> uw() } } diff --git a/man/DataFrame_join.Rd b/man/DataFrame_join.Rd index d84357cba..1a8519583 100644 --- a/man/DataFrame_join.Rd +++ b/man/DataFrame_join.Rd @@ -7,7 +7,7 @@ DataFrame_join( other, on = NULL, - how = c("inner", "left", "full", "semi", "anti", "cross"), + how = "inner", ..., left_on = NULL, right_on = NULL, diff --git a/man/LazyFrame_join.Rd b/man/LazyFrame_join.Rd index 382cbff76..dc6b2bb57 100644 --- a/man/LazyFrame_join.Rd +++ b/man/LazyFrame_join.Rd @@ -7,7 +7,7 @@ LazyFrame_join( other, on = NULL, - how = c("inner", "left", "full", "semi", "anti", "cross"), + how = "inner", ..., left_on = NULL, right_on = NULL, diff --git a/tests/testthat/test-joins.R b/tests/testthat/test-joins.R index 8e2e1c5d3..c1f8cc9cf 100644 --- a/tests/testthat/test-joins.R +++ b/tests/testthat/test-joins.R @@ -156,37 +156,37 @@ test_that("argument 'validate' works", { # eager 1:1 expect_grepl_error( df1$join(df2, on = "x", validate = "1:1"), - "join keys did not fulfil 1:1 validation" + "join keys did not fulfill 1:1 validation" ) # lazy 1:1 expect_grepl_error( df1$lazy()$join(df2$lazy(), on = "x", validate = "1:1")$collect(), - "join keys did not fulfil 1:1 validation" + "join keys did not fulfill 1:1 validation" ) # eager m:1 expect_grepl_error( df1$join(df2, on = "x", validate = "m:1"), - "join keys did not fulfil m:1 validation" + "join keys did not fulfill m:1 validation" ) # lazy m:1 expect_grepl_error( df1$lazy()$join(df2$lazy(), on = "x", validate = "m:1")$collect(), - "join keys did not fulfil m:1 validation" + "join keys did not fulfill m:1 validation" ) # eager 1:m expect_grepl_error( df2$join(df1, on = "x", validate = "1:m"), - "join keys did not fulfil 1:m validation" + "join keys did not fulfill 1:m validation" ) # lazy 1:m expect_grepl_error( df2$lazy()$join(df1$lazy(), on = "x", validate = "1:m")$collect(), - "join keys did not fulfil 1:m 
validation" + "join keys did not fulfill 1:m validation" ) # eager error on unknown validate choice