diff --git a/DESCRIPTION b/DESCRIPTION
index 68cfb6741..034c823ed 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.13.0.19
+Version: 0.13.0.20
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),
diff --git a/NAMESPACE b/NAMESPACE
index 7e97817b9..e463f7261 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -267,6 +267,7 @@ export(data_write)
 export(degroup)
 export(demean)
 export(describe_distribution)
+export(describe_missing)
 export(detrend)
 export(distribution_coef_var)
 export(distribution_mode)
diff --git a/NEWS.md b/NEWS.md
index 35e549ffa..0e9dc28a9 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -16,6 +16,10 @@ BREAKING CHANGES AND DEPRECATIONS
   - if `select` (previously `pattern`) is a named vector, then all elements
     must be named, e.g. `c(length = "Sepal.Length", "Sepal.Width")` errors.
 
+NEW FUNCTIONS
+
+* `describe_missing()`, to report on missing values in a data frame.
+
 CHANGES
 
 * The `select` argument, which is available in different functions to select
diff --git a/R/describe_missing.R b/R/describe_missing.R
new file mode 100644
index 000000000..620d14da1
--- /dev/null
+++ b/R/describe_missing.R
@@ -0,0 +1,127 @@
+#' @title Describe Missing Values in Data According to Guidelines
+#'
+#' @description Provides a detailed description of missing values in a data frame.
+#' This function reports both absolute and percentage missing values of specified
+#' variables.
+#'
+#' @inheritParams extract_column_names
+#' @param by Optional character string, indicating the names of one or more
+#' variables in the data frame. If supplied, the data will be split by these
+#' variables and summary statistics will be computed for each group. Useful
+#' for survey data by first reshaping the data to the long format.
+#' @param sort Logical. Whether to sort the result from highest to lowest
+#' percentage of missing data.
+#' @return A dataframe with the following columns:
+#'  - `variable`: Variables selected.
+#'  - `n_missing`: Number of missing values.
+#'  - `missing_percent`: Percentage of missing values.
+#'  - `complete_percent`: Percentage of non-missing values.
+#' @param ... Arguments passed down to other functions. Currently not used.
+#'
+#' @export
+#' @examples
+#' describe_missing(airquality)
+#'
+#' # Survey data
+#' set.seed(15)
+#' fun <- function() {
+#'   c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA)
+#' }
+#' df <- data.frame(
+#'   ID = c("idz", NA),
+#'   openness_1 = fun(), openness_2 = fun(), openness_3 = fun(),
+#'   extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(),
+#'   agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun()
+#' )
+#'
+#' df_long <- reshape_longer(
+#'   df,
+#'   select = -1,
+#'   names_sep = "_",
+#'   names_to = c("dimension", "item")
+#' )
+#'
+#' describe_missing(
+#'   df_long,
+#'   select = -c(1, 3),
+#'   by = "dimension"
+#' )
+#'
+describe_missing <- function(data,
+                             select = NULL,
+                             exclude = NULL,
+                             ignore_case = FALSE,
+                             regex = FALSE,
+                             verbose = TRUE,
+                             by = NULL,
+                             sort = FALSE,
+                             ...) {
+  # Narrow `data` to the requested columns before any counting happens.
+  if (!is.null(select) || !is.null(exclude)) {
+    data <- data_select(
+      data = data,
+      select = select,
+      exclude = exclude,
+      ignore_case = ignore_case,
+      regex = regex,
+      verbose = verbose,
+      ...
+    )
+  }
+
+  if (is.null(by)) {
+    # Ungrouped: one result row per column.
+    na_list <- lapply(names(data), function(x) {
+      .describe_missing(data[, x, drop = FALSE])
+    })
+  } else {
+    # `by` may name several columns, so reduce the membership test with
+    # all(); `if ()` needs a single TRUE/FALSE and fails on longer conditions.
+    if (!is.character(by) || length(by) == 0L || !all(by %in% names(data))) {
+      stop("The 'by' column does not exist in the data.", call. = FALSE)
+    }
+    # Everything except the grouping column(s) gets analyzed.
+    cols_to_analyze <- setdiff(names(data), by)
+    # Splitting on `data[by]` groups rows by the combination of all `by`
+    # columns; `drop = TRUE` skips combinations that have no rows.
+    grouped_data <- split(data, data[by], drop = TRUE)
+    na_list <- lapply(names(grouped_data), function(group_name) {
+      group <- grouped_data[[group_name]]
+      group_na_list <- lapply(cols_to_analyze, function(x) {
+        .describe_missing(group[, x, drop = FALSE])
+      })
+      group_na_df <- do.call(rbind, group_na_list)
+      # Rows are labelled with the group name, not the analyzed column.
+      group_na_df$variable <- group_name
+      group_na_df
+    })
+  }
+  na_df <- do.call(rbind, na_list)
+
+  # Sort before the "Total" row is appended so that it always stays last.
+  if (isTRUE(sort)) {
+    na_df <- na_df[order(-na_df$missing_percent), ]
+  }
+
+  # NOTE(review): "Total" is computed over every remaining column, which
+  # includes the `by` column(s) when grouping -- confirm this is intended.
+  na_df_tot <- .describe_missing(data)
+  na_df_tot$variable <- "Total"
+  rbind(na_df, na_df_tot)
+}
+
+# Pooled missingness statistics over all cells of `data`, returned as a
+# one-row data frame. `variable` holds the first column name; callers that
+# pass several columns overwrite it.
+.describe_missing <- function(data) {
+  n_missing <- sum(is.na(data))
+  # Multiply as doubles; an integer product overflows to NA for huge data.
+  n_cells <- as.numeric(nrow(data)) * ncol(data)
+  missing_percent <- round(n_missing / n_cells * 100, 2)
+  data.frame(
+    variable = names(data)[1],
+    n_missing = n_missing,
+    missing_percent = missing_percent,
+    complete_percent = 100 - missing_percent
+  )
+}
diff --git a/inst/WORDLIST b/inst/WORDLIST
index a8b4ff08d..bbafb3bd2 100644
--- a/inst/WORDLIST
+++ b/inst/WORDLIST
@@ -78,6 +78,7 @@ labelling
 leptokurtic
 lm
 lme
+macOS
 meaned
 mesokurtic
 midhinge
@@ -91,6 +92,8 @@ poorman
 pre
 pth
 px
+quartile
+quartiles
 readr
 readxl
 recode
diff --git a/man/describe_missing.Rd b/man/describe_missing.Rd
new file mode 100644
index 000000000..daf863738
--- /dev/null
+++ b/man/describe_missing.Rd
@@ -0,0 +1,132 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/describe_missing.R
+\name{describe_missing}
+\alias{describe_missing}
+\title{Describe Missing Values in Data According to Guidelines}
+\usage{
+describe_missing(
+  data,
+  select = NULL,
+  exclude = NULL,
+  ignore_case = FALSE,
+  regex = FALSE,
+  verbose = TRUE,
+  by = NULL,
+  sort = FALSE,
+  ...
+)
+}
+\arguments{
+\item{data}{A data frame.}
+
+\item{select}{Variables that will be included when performing the required
+tasks.
Can be either +\itemize{ +\item a variable specified as a literal variable name (e.g., \code{column_name}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), +\item for some functions, like \code{data_select()} or \code{data_rename()}, \code{select} can +be a named character vector. In this case, the names are used to rename +the columns in the output data frame. See 'Details' in the related +functions to see where this option applies. +\item a formula with variable names (e.g., \code{~column_1 + column_2}), +\item a vector of positive integers, giving the positions counting from the left +(e.g. \code{1} or \code{c(1, 3, 5)}), +\item a vector of negative integers, giving the positions counting from the +right (e.g., \code{-1} or \code{-1:-3}), +\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, +\code{contains()}, a range using \code{:}, or \code{regex()}. \code{starts_with()}, +\code{ends_with()}, and \code{contains()} accept several patterns, e.g +\code{starts_with("Sep", "Petal")}. \code{regex()} can be used to define regular +expression patterns. +\item a function testing for logical conditions, e.g. \code{is.numeric()} (or +\code{is.numeric}), or any user-defined function that selects the variables +for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), +\item ranges specified via literal variable names, select-helpers (except +\code{regex()}) and (user-defined) functions can be negated, i.e. return +non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with()}, +\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means +that matches are \emph{excluded}, and thus, the \code{exclude} argument can be +used alternatively. 
For instance, \code{select=-ends_with("Length")} (with +\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case +negation should not work as expected, use the \code{exclude} argument instead. +} + +If \code{NULL}, selects all columns. Patterns that found no matches are silently +ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} +will just return \code{"Species"}.} + +\item{exclude}{See \code{select}, however, column names matched by the pattern +from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), +excludes no columns.} + +\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or +a regular expression is used in \code{select}, ignores lower/upper case in the +search pattern when matching against variable names.} + +\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be +treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a +character string (or a variable containing a character string) and is not +allowed to be one of the supported select-helpers or a character vector +of length > 1. \code{regex = TRUE} is comparable to using one of the two +select-helpers, \code{select = contains()} or \code{select = regex()}, however, +since the select-helpers may not work when called from inside other +functions (see 'Details'), this argument may be used as workaround.} + +\item{verbose}{Toggle warnings.} + +\item{by}{Optional character string, indicating the names of one or more +variables in the data frame. If supplied, the data will be split by these +variables and summary statistics will be computed for each group. Useful +for survey data by first reshaping the data to the long format.} + +\item{sort}{Logical. Whether to sort the result from highest to lowest +percentage of missing data.} + +\item{...}{Arguments passed down to other functions. 
Currently not used.} +} +\value{ +A dataframe with the following columns: +\itemize{ +\item \code{variable}: Variables selected. +\item \code{n_missing}: Number of missing values. +\item \code{missing_percent}: Percentage of missing values. +\item \code{complete_percent}: Percentage of non-missing values. +} +} +\description{ +Provides a detailed description of missing values in a data frame. +This function reports both absolute and percentage missing values of specified +variables. +} +\examples{ +describe_missing(airquality) + +# Survey data +set.seed(15) +fun <- function() { + c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA) +} +df <- data.frame( + ID = c("idz", NA), + openness_1 = fun(), openness_2 = fun(), openness_3 = fun(), + extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(), + agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun() +) + +df_long <- reshape_longer( + df, + select = -1, + names_sep = "_", + names_to = c("dimension", "item") +) + +describe_missing( + df_long, + select = -c(1, 3), + by = "dimension" +) + +} diff --git a/pkgdown/_pkgdown.yaml b/pkgdown/_pkgdown.yaml index 01b701f03..ce6b59242 100644 --- a/pkgdown/_pkgdown.yaml +++ b/pkgdown/_pkgdown.yaml @@ -66,6 +66,7 @@ reference: - data_tabulate - data_peek - data_seek + - describe_missing - means_by_group - contains("distribution") - kurtosis diff --git a/tests/testthat/_snaps/describe_missing.md b/tests/testthat/_snaps/describe_missing.md new file mode 100644 index 000000000..5c598e2de --- /dev/null +++ b/tests/testthat/_snaps/describe_missing.md @@ -0,0 +1,61 @@ +# describe_missing + + Code + describe_missing(airquality2) + Output + variable n_missing missing_percent complete_percent + 1 Solar.R 7 4.58 95.42 + 2 Wind 0 0.00 100.00 + 3 Temp 0 0.00 100.00 + 4 Month 0 0.00 100.00 + 5 Day 0 0.00 100.00 + 6 Ozone 37 24.18 75.82 + 7 Total 44 4.79 95.21 + +--- + + Code + describe_missing(airquality2, sort = TRUE) + Output + variable n_missing 
missing_percent complete_percent
+      6     Ozone        37           24.18            75.82
+      1   Solar.R         7            4.58            95.42
+      2      Wind         0            0.00           100.00
+      3      Temp         0            0.00           100.00
+      4     Month         0            0.00           100.00
+      5       Day         0            0.00           100.00
+      11    Total        44            4.79            95.21
+
+---
+
+    Code
+      describe_missing(airquality2, select = "Ozone:Temp")
+    Output
+        variable n_missing missing_percent complete_percent
+      1    Ozone        37           24.18            75.82
+      2      Day         0            0.00           100.00
+      3    Month         0            0.00           100.00
+      4     Temp         0            0.00           100.00
+      5    Total        37            6.05            93.95
+
+---
+
+    Code
+      describe_missing(airquality2, exclude = "Ozone:Temp")
+    Output
+        variable n_missing missing_percent complete_percent
+      1  Solar.R         7            4.58            95.42
+      2     Wind         0            0.00           100.00
+      3    Total         7            2.29            97.71
+
+---
+
+    Code
+      describe_missing(df_long, select = -c(1, 3), by = "dimension")
+    Output
+             variable n_missing missing_percent complete_percent
+      1 agreeableness        10           23.81            76.19
+      2  extroversion        17           40.48            59.52
+      3      openness        11           26.19            73.81
+      4         Total        38           15.08            84.92
+
diff --git a/tests/testthat/test-describe_missing.R b/tests/testthat/test-describe_missing.R
new file mode 100644
index 000000000..d26758cca
--- /dev/null
+++ b/tests/testthat/test-describe_missing.R
@@ -0,0 +1,43 @@
+test_that("describe_missing", {
+  # Same columns as `airquality`, with the first column moved to the end
+  airquality2 <- airquality[c(2:6, 1)]
+
+  expect_snapshot(describe_missing(airquality2))
+  expect_snapshot(describe_missing(airquality2, sort = TRUE))
+  expect_snapshot(describe_missing(airquality2, select = "Ozone:Temp"))
+  expect_snapshot(describe_missing(airquality2, exclude = "Ozone:Temp"))
+
+  # Survey-style data for the `by` argument: each simulated item is a
+  # random draw followed by three trailing NAs
+  set.seed(15)
+  simulate_item <- function() {
+    drawn <- sample(c(NA, 1:10), replace = TRUE)
+    c(drawn, NA, NA, NA)
+  }
+  survey_wide <- data.frame(
+    ID = c("idz", NA),
+    openness_1 = simulate_item(),
+    openness_2 = simulate_item(),
+    openness_3 = simulate_item(),
+    extroversion_1 = simulate_item(),
+    extroversion_2 = simulate_item(),
+    extroversion_3 = simulate_item(),
+    agreeableness_1 = simulate_item(),
+    agreeableness_2 = simulate_item(),
+    agreeableness_3 = simulate_item(),
+    stringsAsFactors = FALSE
+  )
+
+  # Reshape to long format with datawizard
+  df_long <- reshape_longer(
+    survey_wide,
+    select = -1,
+    names_sep = "_",
+    names_to = c("dimension", "item")
+  )
+
+  # Missing values per `dimension` group
+  expect_snapshot(
+    describe_missing(df_long, select = -c(1, 3), by = "dimension")
+  )
+})