From 3e8bf23632c53183f59f41ebf8c6135da10cda29 Mon Sep 17 00:00:00 2001 From: "Leonardo R. Jorge" Date: Fri, 27 Sep 2024 14:37:33 +0200 Subject: [PATCH] correct ordered factor definition (#1686) Co-authored-by: Hadley Wickham --- factors.qmd | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/factors.qmd b/factors.qmd index d0864daf0..565b39060 100644 --- a/factors.qmd +++ b/factors.qmd @@ -56,7 +56,7 @@ To create a factor you must start by creating a list of the valid **levels**: ```{r} month_levels <- c( - "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" ) ``` @@ -169,7 +169,7 @@ relig_summary <- gss_cat |> n = n() ) -ggplot(relig_summary, aes(x = tvhours, y = relig)) + +ggplot(relig_summary, aes(x = tvhours, y = relig)) + geom_point() ``` @@ -212,7 +212,7 @@ What if we create a similar plot looking at how average age varies across report #| fig-alt: | #| A scatterplot with age on the x-axis and income on the y-axis. Income #| has been reordered in order of average age which doesn't make much -#| sense. One section of the y-axis goes from $6000-6999, then <$1000, +#| sense. One section of the y-axis goes from $6000-6999, then <$1000, #| then $8000-9999. rincome_summary <- gss_cat |> group_by(rincome) |> @@ -221,7 +221,7 @@ rincome_summary <- gss_cat |> n = n() ) -ggplot(rincome_summary, aes(x = age, y = fct_reorder(rincome, age))) + +ggplot(rincome_summary, aes(x = age, y = fct_reorder(rincome, age))) + geom_point() ``` @@ -257,15 +257,15 @@ This makes the plot easier to read because the colors of the line at the far rig #| A line plot with age on the x-axis and proportion on the y-axis. #| There is one line for each category of marital status: no answer, #| never married, separated, divorced, widowed, and married. 
It is -#| a little hard to read the plot because the order of the legend is -#| unrelated to the lines on the plot. Rearranging the legend makes -#| the plot easier to read because the legend colors now match the -#| order of the lines on the far right of the plot. You can see some -#| unsurprising patterns: the proportion never married decreases with -#| age, married forms an upside down U shape, and widowed starts off +#| a little hard to read the plot because the order of the legend is +#| unrelated to the lines on the plot. Rearranging the legend makes +#| the plot easier to read because the legend colors now match the +#| order of the lines on the far right of the plot. You can see some +#| unsurprising patterns: the proportion never married decreases with +#| age, married forms an upside down U shape, and widowed starts off #| low but increases steeply after age 60. by_age <- gss_cat |> - filter(!is.na(age)) |> + filter(!is.na(age)) |> count(age, marital) |> group_by(age) |> mutate( @@ -273,13 +273,13 @@ by_age <- gss_cat |> ) ggplot(by_age, aes(x = age, y = prop, color = marital)) + - geom_line(linewidth = 1) + + geom_line(linewidth = 1) + scale_color_brewer(palette = "Set1") ggplot(by_age, aes(x = age, y = prop, color = fct_reorder2(marital, age, prop))) + geom_line(linewidth = 1) + - scale_color_brewer(palette = "Set1") + - labs(color = "marital") + scale_color_brewer(palette = "Set1") + + labs(color = "marital") ``` Finally, for bar plots, you can use `fct_infreq()` to order levels in decreasing frequency: this is the simplest type of reordering because it doesn't need any extra variables. 
@@ -288,7 +288,7 @@ Combine it with `fct_rev()` if you want them in increasing frequency so that in ```{r} #| fig-alt: | #| A bar char of marital status ordered from least to most common: -#| no answer (~0), separated (~1,000), widowed (~2,000), divorced +#| no answer (~0), separated (~1,000), widowed (~2,000), divorced #| (~3,000), never married (~5,000), married (~10,000). gss_cat |> mutate(marital = marital |> fct_infreq() |> fct_rev()) |> @@ -409,21 +409,24 @@ Read the documentation to learn about `fct_lump_min()` and `fct_lump_prop()` whi ## Ordered factors {#sec-ordered-factors} -Before we go on, there's a special type of factor that needs to be mentioned briefly: ordered factors. -Ordered factors, created with `ordered()`, imply a strict ordering and equal distance between levels: the first level is "less than" the second level by the same amount that the second level is "less than" the third level, and so on. -You can recognize them when printing because they use `<` between the factor levels: +Before we continue, it's important to briefly mention a special type of factor: ordered factors. +Created with the `ordered()` function, ordered factors imply a strict ordering between levels, but don't specify anything about the magnitude of the differences between the levels. +You use ordered factors when you know the levels are ranked, but there's no precise numerical ranking. + +You can identify an ordered factor when it's printed because it uses `<` symbols between the factor levels: ```{r} ordered(c("a", "b", "c")) ``` - -In practice, `ordered()` factors behave very similarly to regular factors. +In both base R and the tidyverse, ordered factors behave very similarly to regular factors. There are only two places where you might notice different behavior: - If you map an ordered factor to color or fill in ggplot2, it will default to `scale_color_viridis()`/`scale_fill_viridis()`, a color scale that implies a ranking. 
-- If you use an ordered function in a linear model, it will use "polygonal contrasts". These are mildly useful, but you are unlikely to have heard of them unless you have a PhD in Statistics, and even then you probably don't routinely interpret them. If you want to learn more, we recommend `vignette("contrasts", package = "faux")` by Lisa DeBruine. +- If you use an ordered predictor in a linear model, it will use "polynomial contrasts". These are mildly useful, but you are unlikely to have heard of them unless you have a PhD in Statistics, and even then you probably don't routinely interpret them. If you want to learn more, we recommend `vignette("contrasts", package = "faux")` by Lisa DeBruine. -Given the arguable utility of these differences, we don't generally recommend using ordered factors. +For the purposes of this book, correctly distinguishing between regular and ordered factors is not particularly important. +More broadly, however, certain fields (particularly the social sciences) do use ordered factors extensively. +In these contexts, it's important to correctly identify them so that other analysis packages can offer the appropriate behavior. ## Summary