Skip to content

Commit

Permalink
Support writing linear-tsv-style with escape="sep"
Browse files Browse the repository at this point in the history
  • Loading branch information
dvg-p4 committed Feb 15, 2024
1 parent 788df43 commit fd3a60c
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 12 deletions.
26 changes: 23 additions & 3 deletions R/vroom_write.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#' @param escape The type of escape to use when quotes are in the data.
#' - `double` - quotes are escaped by doubling them.
#' - `backslash` - quotes are escaped by a preceding backslash.
#' - `sep` - tabs, newlines, carriage returns, and backslashes are escaped as `\t`, `\n`, `\r`, and `\\`.
#' - `none` - quotes are not escaped.
#' @param quote How to handle fields which contain characters that need to be
#' quoted.
Expand Down Expand Up @@ -37,7 +38,7 @@
#' # vroom_write(mtcars, "mtcars.tsv.xz")
vroom_write <- function(x, file, delim = '\t', eol = "\n", na = "NA", col_names = !append,
append = FALSE, quote = c("needed", "all", "none"), escape =
c("double", "backslash", "none"), bom = FALSE, num_threads =
c("double", "backslash", "sep", "none"), bom = FALSE, num_threads =
vroom_threads(), progress = vroom_progress(), path = deprecated()) {

if (lifecycle::is_present(path)) {
Expand All @@ -53,6 +54,15 @@ vroom_write <- function(x, file, delim = '\t', eol = "\n", na = "NA", col_names
quote <- match.arg(quote)
escape <- match.arg(escape)

if (escape == "sep") {
if (!all(c(delim, eol) %in% c("\t", "\n", "\r", "\r\n"))) {
stop("Can only escape separators `\\t`, `\\n`, and `\\r`")
}
if (quote != "none") {
warning("quotes in data will not be escaped with `escape = sep`")
}
}

opts <- get_vroom_write_opts(quote, escape, bom)

# Standardise path returns a list, but we will only ever have 1 output file.
Expand Down Expand Up @@ -109,7 +119,8 @@ vroom_write_opts <- function() c(
"quote_all" = 2L,
"escape_double" = 4L,
"escape_backslash" = 8L,
"bom" = 16L
"bom" = 16L,
"escape_sep" = 32L
)

#' Convert a data frame to a delimited string
Expand All @@ -121,7 +132,7 @@ vroom_write_opts <- function() c(
#' @inheritParams vroom_write
#' @export
vroom_format <- function(x, delim = "\t", eol = "\n", na = "NA", col_names = TRUE,
escape = c("double", "backslash", "none"),
escape = c("double", "backslash", "sep", "none"),
quote = c("needed", "all", "none"),
bom = FALSE,
num_threads = vroom_threads()) {
Expand All @@ -135,6 +146,15 @@ vroom_format <- function(x, delim = "\t", eol = "\n", na = "NA", col_names = TRU
quote <- match.arg(quote)
escape <- match.arg(escape)

if (escape == "sep") {
if (!all(c(delim, eol) %in% c("\t", "\n", "\r", "\r\n"))) {
stop("Can only escape separators `\\t`, `\\n`, and `\\r`")
}
if (quote != "none") {
warning("quotes in data will not be escaped with `escape = sep`")
}
}

opts <- get_vroom_write_opts(quote, escape, bom)

# This seems to work ok in practice
Expand Down
44 changes: 35 additions & 9 deletions src/vroom_write.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ typedef enum {
quote_all = 2,
escape_double = 4,
escape_backslash = 8,
bom = 16
bom = 16,
escape_sep = 32
} vroom_write_opt_t;

size_t get_buffer_size(
Expand Down Expand Up @@ -135,16 +136,41 @@ void str_to_buf(
}

auto end = str_p + len;
bool should_escape = options & (escape_double | escape_backslash);
auto escape =
options & escape_double ? '"' : options & escape_backslash ? '\\' : '\0';

buf.reserve(buf.size() + len);
while (str_p < end) {
if (should_escape && *str_p == '"') {
buf.push_back(escape);

if (options & escape_sep) {
while (str_p < end) {
if (*str_p == '\t') {
buf.push_back('\\');
buf.push_back('t');
++str_p;
} else if (*str_p == '\n') {
buf.push_back('\\');
buf.push_back('n');
++str_p;
} else if (*str_p == '\r') {
buf.push_back('\\');
buf.push_back('r');
++str_p;
} else if (*str_p == '\\') {
buf.push_back('\\');
buf.push_back('\\');
++str_p;
} else {
buf.push_back(*str_p++);
}
}
} else {
bool should_escape = options & (escape_double | escape_backslash);
auto escape =
options & escape_double ? '"' : options & escape_backslash ? '\\' : '\0';

while (str_p < end) {
if (should_escape && *str_p == '"') {
buf.push_back(escape);
}
buf.push_back(*str_p++);
}
buf.push_back(*str_p++);
}

if (should_quote) {
Expand Down

0 comments on commit fd3a60c

Please sign in to comment.