pola-rs · etiennebacher · Sep 23, 2024 · Sep 22, 2024 · Sep 22, 2024 · Sep 23, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -17,6 +17,7 @@
 - New argument `strict` in `$drop()` to determine whether unknown column names
   should trigger an error (#1220).
 - New method `$to_dummies()` for `DataFrame` (#1225).
+- New argument `include_file_paths` in `pl_scan_csv()` and `pl_read_csv()` (#1235).
 
 ### Bug fixes
 

diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R
@@ -92,7 +92,7 @@ concat_df_horizontal <- function(l) .Call(wrap__concat_df_horizontal, l)
 
 concat_series <- function(l, rechunk, to_supertypes) .Call(wrap__concat_series, l, rechunk, to_supertypes)
 
-new_from_csv <- function(path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines) .Call(wrap__new_from_csv, path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines)
+new_from_csv <- function(path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, include_file_paths) .Call(wrap__new_from_csv, path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, include_file_paths)
 
 import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_index, hive_partitioning, hive_schema, try_parse_hive_dates, include_file_paths) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_index, hive_partitioning, hive_schema, try_parse_hive_dates, include_file_paths)
 

diff --git a/R/io_csv.R b/R/io_csv.R
@@ -65,6 +65,8 @@
 #' @param truncate_ragged_lines Truncate lines that are longer than the schema.
 #' @param reuse_downloaded If `TRUE`(default) and a URL was provided, cache the
 #' downloaded files in session for an easy reuse.
+#' @param include_file_paths Include the path of the source file(s) as a column
+#' with this name.
 #' @return [LazyFrame][LazyFrame_class]
 #' @examples
 #' my_file = tempfile()
@@ -97,7 +99,8 @@ pl_scan_csv = function(
     eol_char = "\n",
     raise_if_empty = TRUE,
     truncate_ragged_lines = FALSE,
-    reuse_downloaded = TRUE) {
+    reuse_downloaded = TRUE,
+    include_file_paths = NULL) {
   # capture all args and modify some to match lower level function
   args = as.list(environment())
 
@@ -181,7 +184,8 @@ pl_read_csv = function(
     eol_char = "\n",
     raise_if_empty = TRUE,
     truncate_ragged_lines = FALSE,
-    reuse_downloaded = TRUE) {
+    reuse_downloaded = TRUE,
+    include_file_paths = NULL) {
   .args = as.list(environment())
   result({
     do.call(pl$scan_csv, .args)$collect()

diff --git a/man/IO_read_csv.Rd b/man/IO_read_csv.Rd
diff --git a/man/IO_read_parquet.Rd b/man/IO_read_parquet.Rd
diff --git a/man/IO_scan_csv.Rd b/man/IO_scan_csv.Rd
diff --git a/man/IO_scan_parquet.Rd b/man/IO_scan_parquet.Rd
diff --git a/src/rust/src/rdataframe/read_csv.rs b/src/rust/src/rdataframe/read_csv.rs
@@ -72,6 +72,7 @@ pub fn new_from_csv(
     eol_char: Robj,
     raise_if_empty: Robj,
     truncate_ragged_lines: Robj,
+    include_file_paths: Robj,
 ) -> RResult<RPolarsLazyFrame> {
     let offset = robj_to!(Option, u32, row_index_offset)?.unwrap_or(0);
     let opt_rowcount = robj_to!(Option, String, row_index_name)?.map(|name| RowIndex {
@@ -126,6 +127,7 @@ pub fn new_from_csv(
         // .with_missing_is_null(!robj_to!(bool, missing_utf8_is_empty_string)?)
         .with_row_index(opt_rowcount)
         .with_truncate_ragged_lines(robj_to!(bool, truncate_ragged_lines)?)
+        .with_include_file_paths(robj_to!(Option, String, include_file_paths)?.map(|x| x.into()))
         .with_raise_if_empty(robj_to!(bool, raise_if_empty)?)
         .finish()
         .map_err(polars_to_rpolars_err)

diff --git a/tests/testthat/test-csv-read.R b/tests/testthat/test-csv-read.R
@@ -197,3 +197,17 @@ test_that("cache url tempfile", {
   expect_false(is.null(cache_temp_file[[url]]))
   expect_equal(attempt_1, attempt_2)
 })
+
+test_that("scan_csv can include file path", {
+  skip_if_not_installed("withr")
+  temp_file_1 = withr::local_tempfile()
+  temp_file_2 = withr::local_tempfile()
+  pl$DataFrame(mtcars)$write_csv(temp_file_1)
+  pl$DataFrame(mtcars)$write_csv(temp_file_2)
+
+  expect_identical(
+    pl$scan_csv(c(temp_file_1, temp_file_2), include_file_paths = "file_paths")$collect()$unique("file_paths") |>
+      dim(),
+    c(2L, 12L)
+  )
+})