From 3be20ad3739ca534eac197a571b752146de1cb90 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 7 Feb 2024 16:07:22 +0100 Subject: [PATCH] `data_tabulate()` gains a `weights` argument (#479) * `data_tabulate()` gains a `weights` argument * add tests * tests * lintr * news, comment * typo * fix test, remove whitespace --- DESCRIPTION | 2 +- NEWS.md | 2 + R/data_modify.R | 2 +- R/data_tabulate.R | 83 +++++++++++++++++++++++--- man/data_tabulate.Rd | 18 +++++- tests/testthat/_snaps/data_tabulate.md | 79 ++++++++++++++++++++++++ tests/testthat/test-data_tabulate.R | 33 +++++++++- 7 files changed, 205 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 41a1744fd..2d17aff8e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.9.1.1 +Version: 0.9.1.2 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NEWS.md b/NEWS.md index 25d043259..ffb41206d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,8 @@ CHANGES * `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify variables at specific positions or based on logical conditions. +* `data_tabulate()` gets a `weights` argument, to compute weighted frequency tables. + # datawizard 0.9.1 CHANGES diff --git a/R/data_modify.R b/R/data_modify.R index 063ecc325..30b21e460 100644 --- a/R/data_modify.R +++ b/R/data_modify.R @@ -122,7 +122,7 @@ #' .at = c("Species", "new_length"), #' .modify = as.numeric #' )} -#' +#' #' # combine "data_find()" and ".at" argument #' out <- data_modify( #' d, diff --git a/R/data_tabulate.R b/R/data_tabulate.R index 7a56e6a87..4d3bb810b 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -13,6 +13,8 @@ #' for printing. #' @param collapse Logical, if `TRUE` collapses multiple tables into one larger #' table for printing. This affects only printing, not the returned object. +#' @param weights Optional numeric vector of weights. Must be of the same length +#' as `x`. If `weights` is supplied, weighted frequencies are calculated. #' @param ... not used. #' @inheritParams find_columns #' @@ -46,6 +48,12 @@ #' #' # to remove the big mark, use "print(..., big_mark = "")" #' print(data_tabulate(x), big_mark = "") +#' +#' # weighted frequencies +#' set.seed(123) +#' efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) +#' data_tabulate(efc$e42dep, weights = efc$weights) +#' #' @export data_tabulate <- function(x, ...) { UseMethod("data_tabulate") @@ -54,7 +62,7 @@ data_tabulate <- function(x, ...) { #' @rdname data_tabulate #' @export -data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose = TRUE, ...) { +data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name = NULL, verbose = TRUE, ...) { # save label attribute, before it gets lost... var_label <- attr(x, "label", exact = TRUE) @@ -70,8 +78,26 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose = x <- droplevels(x) } + # check for correct length of weights - must be equal to "x" + if (!is.null(weights) && length(weights) != length(x)) { + insight::format_error("Length of `weights` must be equal to length of `x`.") + } + # frequency table - freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) + if (is.null(weights)) { + freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL) + } else { + # weighted frequency table + freq_table <- tryCatch( + stats::xtabs( + weights ~ x, + data = data.frame(weights = weights, x = x), + na.action = stats::na.pass, + addNA = TRUE + ), + error = function(e) NULL + ) + } if (is.null(freq_table)) { insight::format_warning(paste0("Can't compute frequency tables for objects of class `", class(x)[1], "`.")) @@ -83,6 +109,11 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose = replacement = c("Value", "N") ) + # we want to round N for weighted frequencies + if (!is.null(weights)) { + out$N <- round(out$N) + } + out$`Raw %` <- 100 * out$N / sum(out$N) out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA) out$`Cumulative %` <- cumsum(out$`Valid %`) @@ -110,6 +141,7 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose = attr(out, "object") <- obj_name attr(out, "group_variable") <- group_variable attr(out, "duplicate_varnames") <- duplicated(out$Variable) + attr(out, "weights") <- weights attr(out, "total_n") <- sum(out$N, na.rm = TRUE) attr(out, "valid_n") <- sum(out$N[-length(out$N)], na.rm = TRUE) @@ -129,6 +161,7 @@ data_tabulate.data.frame <- function(x, regex = FALSE, collapse = FALSE, drop_levels = FALSE, + weights = NULL, verbose = TRUE, ...) { # evaluate arguments @@ -140,11 +173,12 @@ data_tabulate.data.frame <- function(x, verbose = verbose ) out <- lapply(select, function(i) { - data_tabulate(x[[i]], drop_levels = drop_levels, name = i, verbose = verbose, ...) + data_tabulate(x[[i]], drop_levels = drop_levels, weights = weights, name = i, verbose = verbose, ...) }) class(out) <- c("dw_data_tabulates", "list") attr(out, "collapse") <- isTRUE(collapse) + attr(out, "is_weighted") <- !is.null(weights) out } @@ -159,6 +193,7 @@ data_tabulate.grouped_df <- function(x, verbose = TRUE, collapse = FALSE, drop_levels = FALSE, + weights = NULL, ...) { # works only for dplyr >= 0.8.0 grps <- attr(x, "groups", exact = TRUE) @@ -191,12 +226,14 @@ data_tabulate.grouped_df <- function(x, ignore_case = ignore_case, verbose = verbose, drop_levels = drop_levels, + weights = weights, group_variable = group_variable, ... )) } class(out) <- c("dw_data_tabulates", "list") attr(out, "collapse") <- isTRUE(collapse) + attr(out, "is_weighted") <- !is.null(weights) out } @@ -270,7 +307,12 @@ print.dw_data_tabulate <- function(x, big_mark = NULL, ...) { a$valid_n <- .add_commas_in_numbers(a$valid_n, big_mark) # summary of total and valid N (we may add mean/sd as well?) - summary_line <- sprintf("# total N=%s valid N=%s\n\n", a$total_n, a$valid_n) + summary_line <- sprintf( + "# total N=%s valid N=%s%s\n\n", + a$total_n, + a$valid_n, + ifelse(is.null(a$weights), "", " (weighted)") + ) cat(insight::print_color(summary_line, "blue")) # remove information that goes into the header/footer @@ -295,7 +337,12 @@ print_html.dw_data_tabulate <- function(x, big_mark = NULL, ...) { caption <- .table_header(x, "html") # summary of total and valid N (we may add mean/sd as well?) - footer <- sprintf("total N=%i valid N=%i\n\n", a$total_n, a$valid_n) + footer <- sprintf( + "total N=%i valid N=%i%s", + a$total_n, + a$valid_n, + ifelse(is.null(a$weights), "", " (weighted)") + ) # remove information that goes into the header/footer x$Variable <- NULL @@ -320,7 +367,12 @@ print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) { caption <- .table_header(x, "markdown") # summary of total and valid N (we may add mean/sd as well?) - footer <- sprintf("total N=%i valid N=%i\n\n", a$total_n, a$valid_n) + footer <- sprintf( + "total N=%i valid N=%i%s\n\n", + a$total_n, + a$valid_n, + ifelse(is.null(a$weights), "", " (weighted)") + ) # remove information that goes into the header/footer x$Variable <- NULL @@ -339,6 +391,9 @@ print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) { #' @export print.dw_data_tabulates <- function(x, big_mark = NULL, ...) { + # check if we have weights + is_weighted <- isTRUE(attributes(x)$is_weighted) + a <- attributes(x) if (!isTRUE(a$collapse) || length(x) == 1) { for (i in seq_along(x)) { @@ -356,7 +411,11 @@ print.dw_data_tabulates <- function(x, big_mark = NULL, ...) { }) out <- do.call(rbind, x) - cat(insight::print_color("# Frequency Table\n\n", "blue")) + if (is_weighted) { + cat(insight::print_color("# Frequency Table (weighted)\n\n", "blue")) + } else { + cat(insight::print_color("# Frequency Table\n\n", "blue")) + } # print table cat(insight::export_table( @@ -371,6 +430,9 @@ print.dw_data_tabulates <- function(x, big_mark = NULL, ...) { #' @export print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) { + # check if we have weights + is_weighted <- isTRUE(attributes(x)$is_weighted) + if (length(x) == 1) { print_html(x[[1]], big_mark = big_mark, ...) } else { @@ -387,7 +449,7 @@ print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) { insight::export_table( out, missing = "", - caption = "Frequency Table", + caption = ifelse(is_weighted, "Frequency Table (weighted)", "Frequency Table"), format = "html", group_by = "Group" ) @@ -397,6 +459,9 @@ print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) { #' @export print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) { + # check if we have weights + is_weighted <- isTRUE(attributes(x)$is_weighted) + if (length(x) == 1) { print_md(x[[1]], big_mark = big_mark, ...) } else { @@ -417,7 +482,7 @@ print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) { missing = "(NA)", empty_line = "-", format = "markdown", - title = "Frequency Table" + title = ifelse(is_weighted, "Frequency Table (weighted)", "Frequency Table") ) } } diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index a69feb83a..5332f386f 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -8,7 +8,14 @@ \usage{ data_tabulate(x, ...) -\method{data_tabulate}{default}(x, drop_levels = FALSE, name = NULL, verbose = TRUE, ...) +\method{data_tabulate}{default}( + x, + drop_levels = FALSE, + weights = NULL, + name = NULL, + verbose = TRUE, + ... +) \method{data_tabulate}{data.frame}( x, @@ -18,6 +25,7 @@ data_tabulate(x, ...) regex = FALSE, collapse = FALSE, drop_levels = FALSE, + weights = NULL, verbose = TRUE, ... ) @@ -31,6 +39,9 @@ data_tabulate(x, ...) the data are included in the table (with frequency of zero), else unused factor levels are dropped from the frequency table.} +\item{weights}{Optional numeric vector of weights. Must be of the same length +as \code{x}. If \code{weights} is supplied, weighted frequencies are calculated.} + \item{name}{Optional character string, which includes the name that is used for printing.} @@ -125,5 +136,10 @@ data_tabulate(x, name = "Large Number") # to remove the big mark, use "print(..., big_mark = "")" print(data_tabulate(x), big_mark = "") + +# weighted frequencies +set.seed(123) +efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) +data_tabulate(efc$e42dep, weights = efc$weights) \dontshow{\}) # examplesIf} } diff --git a/tests/testthat/_snaps/data_tabulate.md b/tests/testthat/_snaps/data_tabulate.md index 77c128c40..7f482ad86 100644 --- a/tests/testthat/_snaps/data_tabulate.md +++ b/tests/testthat/_snaps/data_tabulate.md @@ -1,3 +1,82 @@ +# data_tabulate, weights + + Code + print(data_tabulate(efc$e42dep, weights = efc$weights)) + Output + elder's dependency (efc$e42dep) + # total N=105 valid N=100 (weighted) + + Value | N | Raw % | Valid % | Cumulative % + ------+----+-------+---------+------------- + 1 | 3 | 2.86 | 3.00 | 3.00 + 2 | 4 | 3.81 | 4.00 | 7.00 + 3 | 26 | 24.76 | 26.00 | 33.00 + 4 | 67 | 63.81 | 67.00 | 100.00 + | 5 | 4.76 | | + +--- + + Code + print_md(data_tabulate(efc$e42dep, weights = efc$weights)) + Output + [1] "Table: elder's dependency (efc$e42dep) (categorical)" + [2] "" + [3] "|Value | N| Raw %| Valid %| Cumulative %|" + [4] "|:-----|--:|-----:|-------:|------------:|" + [5] "|1 | 3| 2.86| 3.00| 3.00|" + [6] "|2 | 4| 3.81| 4.00| 7.00|" + [7] "|3 | 26| 24.76| 26.00| 33.00|" + [8] "|4 | 67| 63.81| 67.00| 100.00|" + [9] "|(NA) | 5| 4.76| (NA)| (NA)|" + [10] "total N=105 valid N=100 (weighted)\n\n" + attr(,"format") + [1] "pipe" + attr(,"class") + [1] "knitr_kable" "character" + +--- + + Code + print(data_tabulate(efc, c("e42dep", "e16sex"), collapse = TRUE, weights = efc$ + weights)) + Output + # Frequency Table (weighted) + + Variable | Value | N | Raw % | Valid % | Cumulative % + ---------+-------+----+-------+---------+------------- + e42dep | 1 | 3 | 2.86 | 3.00 | 3.00 + | 2 | 4 | 3.81 | 4.00 | 7.00 + | 3 | 26 | 24.76 | 26.00 | 33.00 + | 4 | 67 | 63.81 | 67.00 | 100.00 + | | 5 | 4.76 | | + ---------+-------+----+-------+---------+------------- + e16sex | 1 | 50 | 47.62 | 100.00 | 100.00 + | 2 | 55 | 52.38 | | + ------------------------------------------------------ + +--- + + Code + print_md(data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights)) + Output + [1] "Table: Frequency Table (weighted)" + [2] "" + [3] "|Variable | Value| N| Raw %| Valid %| Cumulative %|" + [4] "|:--------|-----:|--:|-----:|-------:|------------:|" + [5] "|e42dep | 1| 3| 2.86| 3.00| 3.00|" + [6] "| | 2| 4| 3.81| 4.00| 7.00|" + [7] "| | 3| 26| 24.76| 26.00| 33.00|" + [8] "| | 4| 67| 63.81| 67.00| 100.00|" + [9] "| | (NA)| 5| 4.76| (NA)| (NA)|" + [10] "| | | | | | |" + [11] "|e16sex | 1| 50| 47.62| 100.00| 100.00|" + [12] "| | 2| 55| 52.38| (NA)| (NA)|" + [13] "| | | | | | |" + attr(,"format") + [1] "pipe" + attr(,"class") + [1] "knitr_kable" "character" + # data_tabulate print Code diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R index bf8010b3f..4390c1af5 100644 --- a/tests/testthat/test-data_tabulate.R +++ b/tests/testthat/test-data_tabulate.R @@ -32,6 +32,35 @@ test_that("data_tabulate numeric", { }) +test_that("data_tabulate, weights", { + data(efc, package = "datawizard") + set.seed(123) + efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5)) + # vector/factor + out1 <- data_tabulate(efc$e42dep, weights = efc$weights) + out2 <- data_tabulate(efc$e42dep) + expect_equal(out1$N, c(3, 4, 26, 67, 5), ignore_attr = TRUE) + expect_equal(out2$N, c(2L, 4L, 28L, 63L, 3L), ignore_attr = TRUE) + expect_equal( + out1$N, + round(xtabs(efc$weights ~ efc$e42dep, addNA = TRUE)), + ignore_attr = TRUE + ) + # data frames + out <- data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights) + expect_equal(out[[1]]$N, out1$N, ignore_attr = TRUE) + # mismatch of lengths + w <- c(efc$weights, 1) + expect_error(data_tabulate(efc$e42dep, weights = w), regex = "Length of `weights`") + # correct table footer + expect_snapshot(print(data_tabulate(efc$e42dep, weights = efc$weights))) + expect_snapshot(print_md(data_tabulate(efc$e42dep, weights = efc$weights))) + # correct table caption + expect_snapshot(print(data_tabulate(efc, c("e42dep", "e16sex"), collapse = TRUE, weights = efc$weights))) + expect_snapshot(print_md(data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights))) +}) + + test_that("data_tabulate data.frame", { x <- data_tabulate(efc, c("e16sex", "c172code")) expect_s3_class(x, "list") @@ -89,7 +118,7 @@ test_that("data_tabulate data.frame", { test_that("data_tabulate print", { set.seed(123) - x <- sample(1:3, 1e6, TRUE) + x <- sample.int(3, 1e6, TRUE) out <- data_tabulate(x, name = "Large Number") expect_identical( attributes(out), @@ -120,7 +149,7 @@ test_that("data_tabulate print multiple", { test_that("data_tabulate big numbers", { set.seed(123) - x <- sample(1:5, size = 1e7, TRUE) + x <- sample.int(5, size = 1e7, TRUE) expect_snapshot(data_tabulate(x)) expect_snapshot(print(data_tabulate(x), big_mark = "-")) })