easystats · rempsyc · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.13.0.19
+Version: 0.13.0.20
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -267,6 +267,7 @@ export(data_write)
 export(degroup)
 export(demean)
 export(describe_distribution)
+export(describe_missing)
 export(detrend)
 export(distribution_coef_var)
 export(distribution_mode)

diff --git a/NEWS.md b/NEWS.md
@@ -16,6 +16,10 @@ BREAKING CHANGES AND DEPRECATIONS
   - if `select` (previously `pattern`) is a named vector, then all elements
     must be named, e.g. `c(length = "Sepal.Length", "Sepal.Width")` errors.
 
+NEW FUNCTIONS
+
+* `describe_missing()`, to report on missing values in a data frame.
+
 CHANGES
 
 * The `select` argument, which is available in different functions to select

diff --git a/R/describe_missing.R b/R/describe_missing.R
@@ -0,0 +1,112 @@
+#' @title Describe Missing Values in Data According to Guidelines
+#'
+#' @description Provides a detailed description of missing values in a data
+#' frame. This function reports both the absolute number and the percentage
+#' of missing values of the specified variables.
+#'
+#' @inheritParams extract_column_names
+#' @param by Optional character vector, indicating the names of one or more
+#' variables in the data frame. If supplied, the data will be split by these
+#' variables and summary statistics will be computed for each group. If
+#' several variables are given, the groups are the observed combinations of
+#' their values, labelled by joining the values with `"."`. Rows with a
+#' missing value in a `by` variable are not assigned to any group. Useful
+#' for survey data by first reshaping the data to the long format.
+#' @param sort Logical. Whether to sort the result from highest to lowest
+#' percentage of missing data. The `"Total"` row always stays last.
+#' @param ... Arguments passed down to [data_select()] when `select` or
+#' `exclude` is specified.
+#'
+#' @return A data frame with one row per selected variable or, if `by` is
+#' supplied, one row per analysed variable within each group (labelled with
+#' the name of the group), followed by a final `"Total"` row. The total is
+#' computed over all cells of the data that remain after applying `select`
+#' and `exclude`, which includes the `by` variables. The columns are:
+#'  - `variable`: Variables selected (or group names, if `by` is supplied).
+#'  - `n_missing`: Number of missing values.
+#'  - `missing_percent`: Percentage of missing values.
+#'  - `complete_percent`: Percentage of non-missing values.
+#'
+#' @export
+#' @examples
+#' describe_missing(airquality)
+#'
+#' # Survey data
+#' set.seed(15)
+#' fun <- function() {
+#'   c(sample(c(NA, 1:10), replace = TRUE), NA, NA, NA)
+#' }
+#' df <- data.frame(
+#'   ID = c("idz", NA),
+#'   openness_1 = fun(), openness_2 = fun(), openness_3 = fun(),
+#'   extroversion_1 = fun(), extroversion_2 = fun(), extroversion_3 = fun(),
+#'   agreeableness_1 = fun(), agreeableness_2 = fun(), agreeableness_3 = fun()
+#' )
+#'
+#' df_long <- reshape_longer(
+#'   df,
+#'   select = -1,
+#'   names_sep = "_",
+#'   names_to = c("dimension", "item")
+#' )
+#'
+#' describe_missing(
+#'   df_long,
+#'   select = -c(1, 3),
+#'   by = "dimension"
+#' )
+#'
+describe_missing <- function(data,
+                             select = NULL,
+                             exclude = NULL,
+                             ignore_case = FALSE,
+                             regex = FALSE,
+                             verbose = TRUE,
+                             by = NULL,
+                             sort = FALSE,
+                             ...) {
+  # Only subset the columns when the caller asked for it; `by` must survive
+  # this selection, otherwise the existence check below fails.
+  if (!is.null(select) || !is.null(exclude)) {
+    data <- data_select(
+      data = data,
+      select = select,
+      exclude = exclude,
+      ignore_case = ignore_case,
+      regex = regex,
+      verbose = verbose,
+      ...
+    )
+  }
+  if (is.null(by)) {
+    # One summary row per column.
+    na_list <- lapply(names(data), function(x) {
+      data_subset <- data[, x, drop = FALSE]
+      .describe_missing(data_subset)
+    })
+  } else {
+    # `by` may name several columns, so collapse the membership test to a
+    # single logical; a length > 1 condition in `if ()` is an error.
+    if (!all(by %in% names(data))) {
+      stop("The 'by' column does not exist in the data.", call. = FALSE)
+    }
+    if (length(by) == 1L) {
+      grouped_data <- split(data, data[[by]])
+    } else {
+      # Split on the interaction of all `by` columns and keep only the
+      # combinations that actually occur in the data.
+      grouped_data <- split(data, data[by], drop = TRUE)
+    }
+    na_list <- lapply(names(grouped_data), function(group_name) {
+      group <- grouped_data[[group_name]]
+      # Identify columns to analyze (exclude the 'by' columns)
+      cols_to_analyze <- setdiff(names(group), by)
+      group_na_list <- lapply(cols_to_analyze, function(x) {
+        data_subset <- group[, x, drop = FALSE]
+        .describe_missing(data_subset)
+      })
+      group_na_df <- do.call(rbind, group_na_list)
+      # NOTE(review): every row of a group is relabelled with the group name,
+      # so with more than one analysed column the column names are lost.
+      group_na_df$variable <- group_name
+      group_na_df
+    })
+  }
+  na_df <- do.call(rbind, na_list)
+  if (isTRUE(sort)) {
+    na_df <- na_df[order(-na_df$missing_percent), ]
+  }
+  # The total is appended after sorting so that it always comes last.
+  na_df_tot <- .describe_missing(data)
+  na_df_tot$variable <- "Total"
+  na_df <- rbind(na_df, na_df_tot)
+  na_df
+}
+
+# Internal helper: summarises missingness over all cells of `data` as a
+# one-row data frame. The row is labelled with the name of the first column;
+# percentages are rounded to two decimals and `complete_percent` is derived
+# from the rounded `missing_percent`.
+.describe_missing <- function(data) {
+  na_count <- sum(is.na(data))
+  total_cells <- nrow(data) * ncol(data)
+  pct_missing <- round(na_count / total_cells * 100, 2)
+  pct_complete <- 100 - pct_missing
+  first_name <- names(data)[1]
+  data.frame(
+    variable = first_name,
+    n_missing = na_count,
+    missing_percent = pct_missing,
+    complete_percent = pct_complete
+  )
+}
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -78,6 +78,7 @@ labelling
 leptokurtic
 lm
 lme
+macOS
 meaned
 mesokurtic
 midhinge
@@ -91,6 +92,8 @@ poorman
 pre
 pth
 px
+quartile
+quartiles
 readr
 readxl
 recode

diff --git a/man/describe_missing.Rd b/man/describe_missing.Rd
diff --git a/pkgdown/_pkgdown.yaml b/pkgdown/_pkgdown.yaml
@@ -66,6 +66,7 @@ reference:
       - data_tabulate
       - data_peek
       - data_seek
+      - describe_missing
       - means_by_group
       - contains("distribution")
       - kurtosis

diff --git a/tests/testthat/_snaps/describe_missing.md b/tests/testthat/_snaps/describe_missing.md
@@ -0,0 +1,61 @@
+# describe_missing
+
+    Code
+      describe_missing(airquality2)
+    Output
+        variable n_missing missing_percent complete_percent
+      1  Solar.R         7            4.58            95.42
+      2     Wind         0            0.00           100.00
+      3     Temp         0            0.00           100.00
+      4    Month         0            0.00           100.00
+      5      Day         0            0.00           100.00
+      6    Ozone        37           24.18            75.82
+      7    Total        44            4.79            95.21
+
+---
+
+    Code
+      describe_missing(airquality2, sort = TRUE)
+    Output
+         variable n_missing missing_percent complete_percent
+      6     Ozone        37           24.18            75.82
+      1   Solar.R         7            4.58            95.42
+      2      Wind         0            0.00           100.00
+      3      Temp         0            0.00           100.00
+      4     Month         0            0.00           100.00
+      5       Day         0            0.00           100.00
+      11    Total        44            4.79            95.21
+
+---
+
+    Code
+      describe_missing(airquality2, select = "Ozone:Temp")
+    Output
+        variable n_missing missing_percent complete_percent
+      1    Ozone        37           24.18            75.82
+      2      Day         0            0.00           100.00
+      3    Month         0            0.00           100.00
+      4     Temp         0            0.00           100.00
+      5    Total        37            6.05            93.95
+
+---
+
+    Code
+      describe_missing(airquality2, exclude = "Ozone:Temp")
+    Output
+        variable n_missing missing_percent complete_percent
+      1  Solar.R         7            4.58            95.42
+      2     Wind         0            0.00           100.00
+      3    Total         7            2.29            97.71
+
+---
+
+    Code
+      describe_missing(df_long, select = -c(1, 3), by = "dimension")
+    Output
+             variable n_missing missing_percent complete_percent
+      1 agreeableness        10           23.81            76.19
+      2  extroversion        17           40.48            59.52
+      3      openness        11           26.19            73.81
+      4         Total        38           15.08            84.92
+