From 3be20ad3739ca534eac197a571b752146de1cb90 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Wed, 7 Feb 2024 16:07:22 +0100
Subject: [PATCH] `data_tabulate()` gains a `weights` argument (#479)

* `data_tabulate()` gains a `weights` argument

* add tests

* tests

* lintr

* news, comment

* typo

* fix test, remove whitespace
---
 DESCRIPTION                            |  2 +-
 NEWS.md                                |  2 +
 R/data_modify.R                        |  2 +-
 R/data_tabulate.R                      | 83 +++++++++++++++++++++++---
 man/data_tabulate.Rd                   | 18 +++++-
 tests/testthat/_snaps/data_tabulate.md | 79 ++++++++++++++++++++++++
 tests/testthat/test-data_tabulate.R    | 33 +++++++++-
 7 files changed, 205 insertions(+), 14 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 41a1744fd..2d17aff8e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.9.1.1
+Version: 0.9.1.2
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
diff --git a/NEWS.md b/NEWS.md
index 25d043259..ffb41206d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -5,6 +5,8 @@ CHANGES
 * `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify
   variables at specific positions or based on logical conditions.
 
+* `data_tabulate()` gets a `weights` argument, to compute weighted frequency tables.
+
 # datawizard 0.9.1
 
 CHANGES
diff --git a/R/data_modify.R b/R/data_modify.R
index 063ecc325..30b21e460 100644
--- a/R/data_modify.R
+++ b/R/data_modify.R
@@ -122,7 +122,7 @@
 #'   .at = c("Species", "new_length"),
 #'   .modify = as.numeric
 #' )}
-#' 
+#'
 #' # combine "data_find()" and ".at" argument
 #' out <- data_modify(
 #'   d,
diff --git a/R/data_tabulate.R b/R/data_tabulate.R
index 7a56e6a87..4d3bb810b 100644
--- a/R/data_tabulate.R
+++ b/R/data_tabulate.R
@@ -13,6 +13,8 @@
 #' for printing.
 #' @param collapse Logical, if `TRUE` collapses multiple tables into one larger
 #' table for printing. This affects only printing, not the returned object.
+#' @param weights Optional numeric vector of weights. Must be of the same length
+#' as `x`. If `weights` is supplied, weighted frequencies are calculated.
 #' @param ... not used.
 #' @inheritParams find_columns
 #'
@@ -46,6 +48,12 @@
 #'
 #' # to remove the big mark, use "print(..., big_mark = "")"
 #' print(data_tabulate(x), big_mark = "")
+#'
+#' # weighted frequencies
+#' set.seed(123)
+#' efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+#' data_tabulate(efc$e42dep, weights = efc$weights)
+#'
 #' @export
 data_tabulate <- function(x, ...) {
   UseMethod("data_tabulate")
@@ -54,7 +62,7 @@ data_tabulate <- function(x, ...) {
 
 #' @rdname data_tabulate
 #' @export
-data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose = TRUE, ...) {
+data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name = NULL, verbose = TRUE, ...) {
   # save label attribute, before it gets lost...
   var_label <- attr(x, "label", exact = TRUE)
 
@@ -70,8 +78,26 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose =
     x <- droplevels(x)
   }
 
+  # check for correct length of weights - must be equal to "x"
+  if (!is.null(weights) && length(weights) != length(x)) {
+    insight::format_error("Length of `weights` must be equal to length of `x`.")
+  }
+
   # frequency table
-  freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
+  if (is.null(weights)) {
+    freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
+  } else {
+    # weighted frequency table
+    freq_table <- tryCatch(
+      stats::xtabs(
+        weights ~ x,
+        data = data.frame(weights = weights, x = x),
+        na.action = stats::na.pass,
+        addNA = TRUE
+      ),
+      error = function(e) NULL
+    )
+  }
 
   if (is.null(freq_table)) {
     insight::format_warning(paste0("Can't compute frequency tables for objects of class `", class(x)[1], "`."))
@@ -83,6 +109,11 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose =
     replacement = c("Value", "N")
   )
 
+  # we want to round N for weighted frequencies
+  if (!is.null(weights)) {
+    out$N <- round(out$N)
+  }
+
   out$`Raw %` <- 100 * out$N / sum(out$N)
   out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA)
   out$`Cumulative %` <- cumsum(out$`Valid %`)
@@ -110,6 +141,7 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose =
   attr(out, "object") <- obj_name
   attr(out, "group_variable") <- group_variable
   attr(out, "duplicate_varnames") <- duplicated(out$Variable)
+  attr(out, "weights") <- weights
 
   attr(out, "total_n") <- sum(out$N, na.rm = TRUE)
   attr(out, "valid_n") <- sum(out$N[-length(out$N)], na.rm = TRUE)
@@ -129,6 +161,7 @@ data_tabulate.data.frame <- function(x,
                                      regex = FALSE,
                                      collapse = FALSE,
                                      drop_levels = FALSE,
+                                     weights = NULL,
                                      verbose = TRUE,
                                      ...) {
   # evaluate arguments
@@ -140,11 +173,12 @@ data_tabulate.data.frame <- function(x,
     verbose = verbose
   )
   out <- lapply(select, function(i) {
-    data_tabulate(x[[i]], drop_levels = drop_levels, name = i, verbose = verbose, ...)
+    data_tabulate(x[[i]], drop_levels = drop_levels, weights = weights, name = i, verbose = verbose, ...)
   })
 
   class(out) <- c("dw_data_tabulates", "list")
   attr(out, "collapse") <- isTRUE(collapse)
+  attr(out, "is_weighted") <- !is.null(weights)
 
   out
 }
@@ -159,6 +193,7 @@ data_tabulate.grouped_df <- function(x,
                                      verbose = TRUE,
                                      collapse = FALSE,
                                      drop_levels = FALSE,
+                                     weights = NULL,
                                      ...) {
   # works only for dplyr >= 0.8.0
   grps <- attr(x, "groups", exact = TRUE)
@@ -191,12 +226,14 @@ data_tabulate.grouped_df <- function(x,
       ignore_case = ignore_case,
       verbose = verbose,
       drop_levels = drop_levels,
+      weights = weights,
       group_variable = group_variable,
       ...
     ))
   }
   class(out) <- c("dw_data_tabulates", "list")
   attr(out, "collapse") <- isTRUE(collapse)
+  attr(out, "is_weighted") <- !is.null(weights)
 
   out
 }
@@ -270,7 +307,12 @@ print.dw_data_tabulate <- function(x, big_mark = NULL, ...) {
   a$valid_n <- .add_commas_in_numbers(a$valid_n, big_mark)
 
   # summary of total and valid N (we may add mean/sd as well?)
-  summary_line <- sprintf("# total N=%s valid N=%s\n\n", a$total_n, a$valid_n)
+  summary_line <- sprintf(
+    "# total N=%s valid N=%s%s\n\n",
+    a$total_n,
+    a$valid_n,
+    ifelse(is.null(a$weights), "", " (weighted)")
+  )
   cat(insight::print_color(summary_line, "blue"))
 
   # remove information that goes into the header/footer
@@ -295,7 +337,12 @@ print_html.dw_data_tabulate <- function(x, big_mark = NULL, ...) {
   caption <- .table_header(x, "html")
 
   # summary of total and valid N (we may add mean/sd as well?)
-  footer <- sprintf("total N=%i valid N=%i\n\n", a$total_n, a$valid_n)
+  footer <- sprintf(
+    "total N=%i valid N=%i%s",
+    a$total_n,
+    a$valid_n,
+    ifelse(is.null(a$weights), "", " (weighted)")
+  )
 
   # remove information that goes into the header/footer
   x$Variable <- NULL
@@ -320,7 +367,12 @@ print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) {
   caption <- .table_header(x, "markdown")
 
   # summary of total and valid N (we may add mean/sd as well?)
-  footer <- sprintf("total N=%i valid N=%i\n\n", a$total_n, a$valid_n)
+  footer <- sprintf(
+    "total N=%i valid N=%i%s\n\n",
+    a$total_n,
+    a$valid_n,
+    ifelse(is.null(a$weights), "", " (weighted)")
+  )
 
   # remove information that goes into the header/footer
   x$Variable <- NULL
@@ -339,6 +391,9 @@ print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) {
 
 #' @export
 print.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
+  # check if we have weights
+  is_weighted <- isTRUE(attributes(x)$is_weighted)
+
   a <- attributes(x)
   if (!isTRUE(a$collapse) || length(x) == 1) {
     for (i in seq_along(x)) {
@@ -356,7 +411,11 @@ print.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
     })
 
     out <- do.call(rbind, x)
-    cat(insight::print_color("# Frequency Table\n\n", "blue"))
+    if (is_weighted) {
+      cat(insight::print_color("# Frequency Table (weighted)\n\n", "blue"))
+    } else {
+      cat(insight::print_color("# Frequency Table\n\n", "blue"))
+    }
 
     # print table
     cat(insight::export_table(
@@ -371,6 +430,9 @@ print.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
 
 #' @export
 print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
+  # check if we have weights
+  is_weighted <- isTRUE(attributes(x)$is_weighted)
+
   if (length(x) == 1) {
     print_html(x[[1]], big_mark = big_mark, ...)
   } else {
@@ -387,7 +449,7 @@ print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
     insight::export_table(
       out,
       missing = "<NA>",
-      caption = "Frequency Table",
+      caption = ifelse(is_weighted, "Frequency Table (weighted)", "Frequency Table"),
       format = "html",
       group_by = "Group"
     )
@@ -397,6 +459,9 @@ print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
 
 #' @export
 print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
+  # check if we have weights
+  is_weighted <- isTRUE(attributes(x)$is_weighted)
+
   if (length(x) == 1) {
     print_md(x[[1]], big_mark = big_mark, ...)
   } else {
@@ -417,7 +482,7 @@ print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
       missing = "(NA)",
       empty_line = "-",
       format = "markdown",
-      title = "Frequency Table"
+      title = ifelse(is_weighted, "Frequency Table (weighted)", "Frequency Table")
     )
   }
 }
diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd
index a69feb83a..5332f386f 100644
--- a/man/data_tabulate.Rd
+++ b/man/data_tabulate.Rd
@@ -8,7 +8,14 @@
 \usage{
 data_tabulate(x, ...)
 
-\method{data_tabulate}{default}(x, drop_levels = FALSE, name = NULL, verbose = TRUE, ...)
+\method{data_tabulate}{default}(
+  x,
+  drop_levels = FALSE,
+  weights = NULL,
+  name = NULL,
+  verbose = TRUE,
+  ...
+)
 
 \method{data_tabulate}{data.frame}(
   x,
@@ -18,6 +25,7 @@ data_tabulate(x, ...)
   regex = FALSE,
   collapse = FALSE,
   drop_levels = FALSE,
+  weights = NULL,
   verbose = TRUE,
   ...
 )
@@ -31,6 +39,9 @@ data_tabulate(x, ...)
 the data are included in the table (with frequency of zero), else unused
 factor levels are dropped from the frequency table.}
 
+\item{weights}{Optional numeric vector of weights. Must be of the same length
+as \code{x}. If \code{weights} is supplied, weighted frequencies are calculated.}
+
 \item{name}{Optional character string, which includes the name that is used
 for printing.}
 
@@ -125,5 +136,10 @@ data_tabulate(x, name = "Large Number")
 
 # to remove the big mark, use "print(..., big_mark = "")"
 print(data_tabulate(x), big_mark = "")
+
+# weighted frequencies
+set.seed(123)
+efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+data_tabulate(efc$e42dep, weights = efc$weights)
 \dontshow{\}) # examplesIf}
 }
diff --git a/tests/testthat/_snaps/data_tabulate.md b/tests/testthat/_snaps/data_tabulate.md
index 77c128c40..7f482ad86 100644
--- a/tests/testthat/_snaps/data_tabulate.md
+++ b/tests/testthat/_snaps/data_tabulate.md
@@ -1,3 +1,82 @@
+# data_tabulate, weights
+
+    Code
+      print(data_tabulate(efc$e42dep, weights = efc$weights))
+    Output
+      elder's dependency (efc$e42dep) <categorical>
+      # total N=105 valid N=100 (weighted)
+      
+      Value |  N | Raw % | Valid % | Cumulative %
+      ------+----+-------+---------+-------------
+      1     |  3 |  2.86 |    3.00 |         3.00
+      2     |  4 |  3.81 |    4.00 |         7.00
+      3     | 26 | 24.76 |   26.00 |        33.00
+      4     | 67 | 63.81 |   67.00 |       100.00
+      <NA>  |  5 |  4.76 |    <NA> |         <NA>
+
+---
+
+    Code
+      print_md(data_tabulate(efc$e42dep, weights = efc$weights))
+    Output
+       [1] "Table: elder's dependency (efc$e42dep) (categorical)"
+       [2] ""                                                    
+       [3] "|Value |  N| Raw %| Valid %| Cumulative %|"          
+       [4] "|:-----|--:|-----:|-------:|------------:|"          
+       [5] "|1     |  3|  2.86|    3.00|         3.00|"          
+       [6] "|2     |  4|  3.81|    4.00|         7.00|"          
+       [7] "|3     | 26| 24.76|   26.00|        33.00|"          
+       [8] "|4     | 67| 63.81|   67.00|       100.00|"          
+       [9] "|(NA)  |  5|  4.76|    (NA)|         (NA)|"          
+      [10] "total N=105 valid N=100 (weighted)\n\n"              
+      attr(,"format")
+      [1] "pipe"
+      attr(,"class")
+      [1] "knitr_kable" "character"  
+
+---
+
+    Code
+      print(data_tabulate(efc, c("e42dep", "e16sex"), collapse = TRUE, weights = efc$
+        weights))
+    Output
+      # Frequency Table (weighted)
+      
+      Variable | Value |  N | Raw % | Valid % | Cumulative %
+      ---------+-------+----+-------+---------+-------------
+      e42dep   |     1 |  3 |  2.86 |    3.00 |         3.00
+               |     2 |  4 |  3.81 |    4.00 |         7.00
+               |     3 | 26 | 24.76 |   26.00 |        33.00
+               |     4 | 67 | 63.81 |   67.00 |       100.00
+               |  <NA> |  5 |  4.76 |    <NA> |         <NA>
+      ---------+-------+----+-------+---------+-------------
+      e16sex   |     1 | 50 | 47.62 |  100.00 |       100.00
+               |     2 | 55 | 52.38 |    <NA> |         <NA>
+      ------------------------------------------------------
+
+---
+
+    Code
+      print_md(data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights))
+    Output
+       [1] "Table: Frequency Table (weighted)"                   
+       [2] ""                                                    
+       [3] "|Variable | Value|  N| Raw %| Valid %| Cumulative %|"
+       [4] "|:--------|-----:|--:|-----:|-------:|------------:|"
+       [5] "|e42dep   |     1|  3|  2.86|    3.00|         3.00|"
+       [6] "|         |     2|  4|  3.81|    4.00|         7.00|"
+       [7] "|         |     3| 26| 24.76|   26.00|        33.00|"
+       [8] "|         |     4| 67| 63.81|   67.00|       100.00|"
+       [9] "|         |  (NA)|  5|  4.76|    (NA)|         (NA)|"
+      [10] "|         |      |   |      |        |             |"
+      [11] "|e16sex   |     1| 50| 47.62|  100.00|       100.00|"
+      [12] "|         |     2| 55| 52.38|    (NA)|         (NA)|"
+      [13] "|         |      |   |      |        |             |"
+      attr(,"format")
+      [1] "pipe"
+      attr(,"class")
+      [1] "knitr_kable" "character"  
+
 # data_tabulate print
 
     Code
diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R
index bf8010b3f..4390c1af5 100644
--- a/tests/testthat/test-data_tabulate.R
+++ b/tests/testthat/test-data_tabulate.R
@@ -32,6 +32,35 @@ test_that("data_tabulate numeric", {
 })
 
 
+test_that("data_tabulate, weights", {
+  data(efc, package = "datawizard")
+  set.seed(123)
+  efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
+  # vector/factor
+  out1 <- data_tabulate(efc$e42dep, weights = efc$weights)
+  out2 <- data_tabulate(efc$e42dep)
+  expect_equal(out1$N, c(3, 4, 26, 67, 5), ignore_attr = TRUE)
+  expect_equal(out2$N, c(2L, 4L, 28L, 63L, 3L), ignore_attr = TRUE)
+  expect_equal(
+    out1$N,
+    round(xtabs(efc$weights ~ efc$e42dep, addNA = TRUE)),
+    ignore_attr = TRUE
+  )
+  # data frames
+  out <- data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights)
+  expect_equal(out[[1]]$N, out1$N, ignore_attr = TRUE)
+  # mismatch of lengths
+  w <- c(efc$weights, 1)
+  expect_error(data_tabulate(efc$e42dep, weights = w), regex = "Length of `weights`")
+  # correct table footer
+  expect_snapshot(print(data_tabulate(efc$e42dep, weights = efc$weights)))
+  expect_snapshot(print_md(data_tabulate(efc$e42dep, weights = efc$weights)))
+  # correct table caption
+  expect_snapshot(print(data_tabulate(efc, c("e42dep", "e16sex"), collapse = TRUE, weights = efc$weights)))
+  expect_snapshot(print_md(data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights)))
+})
+
+
 test_that("data_tabulate data.frame", {
   x <- data_tabulate(efc, c("e16sex", "c172code"))
   expect_s3_class(x, "list")
@@ -89,7 +118,7 @@ test_that("data_tabulate data.frame", {
 
 test_that("data_tabulate print", {
   set.seed(123)
-  x <- sample(1:3, 1e6, TRUE)
+  x <- sample.int(3, 1e6, TRUE)
   out <- data_tabulate(x, name = "Large Number")
   expect_identical(
     attributes(out),
@@ -120,7 +149,7 @@ test_that("data_tabulate print multiple", {
 
 test_that("data_tabulate big numbers", {
   set.seed(123)
-  x <- sample(1:5, size = 1e7, TRUE)
+  x <- sample.int(5, size = 1e7, TRUE)
   expect_snapshot(data_tabulate(x))
   expect_snapshot(print(data_tabulate(x), big_mark = "-"))
 })