From b1d6f8e120bd53586888ae4f9cf29e0c4fd14a90 Mon Sep 17 00:00:00 2001
From: hrbrmstr
Date: Wed, 19 Jun 2019 08:57:10 -0400
Subject: [PATCH 1/3] added `download_file()`; fixed missing word in `write_disk()` docs

---
 NAMESPACE                           |   1 +
 NEWS.md                             |   3 +
 R/download-file.R                   | 155 ++++++++++++++++++++++++++++
 R/write-function.R                  |   6 +-
 man/download_file.Rd                |  69 +++++++++++++
 man/httr-package.Rd                 |   1 +
 man/write_disk.Rd                   |   2 +-
 tests/testthat/test-download-file.R |  51 +++++++++
 8 files changed, 284 insertions(+), 4 deletions(-)
 create mode 100644 R/download-file.R
 create mode 100644 man/download_file.Rd
 create mode 100644 tests/testthat/test-download-file.R

diff --git a/NAMESPACE b/NAMESPACE
index a3821b9b..8a257e75 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -51,6 +51,7 @@ export(content_type_json)
 export(content_type_xml)
 export(cookies)
 export(curl_docs)
+export(download_file)
 export(get_callback)
 export(guess_media)
 export(handle)
diff --git a/NEWS.md b/NEWS.md
index d3e26842..01d3621c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,6 +9,9 @@
 * `RETRY()` now throws the correct error message if an error occurs during
   the request (@austin3dickey, #581).
 
+* New `download_file()` function to support cache-aware single and multi-URL
+  file downloads (@hrbrmstr).
+
 # httr 1.4.0
 
 ## OAuth
diff --git a/R/download-file.R b/R/download-file.R
new file mode 100644
index 00000000..7e5bb4a1
--- /dev/null
+++ b/R/download-file.R
@@ -0,0 +1,155 @@
possibly <- function(.f, otherwise, quiet = TRUE) { # local stand-in for purrr::possibly()
  force(otherwise)
  function(...) {
    tryCatch(
      .f(...),
      error = function(e) {
        if (!quiet)
          message("Error: ", e$message)
        otherwise
      },
      interrupt = function(e) {
        stop("Terminated by user", call. = FALSE)
      }
    )
  }
}

safe_GET <- possibly(GET, NULL, quiet = TRUE)

#' Download file from the Internet (cache-aware)
#'
#' This is an alternative to [utils::download.file()] and a convenience wrapper for
#' [GET()] + [httr::write_disk()] to perform file downloads.
#'
#' Since this function uses [GET()], callers can pass in `httr` configuration
#' options to customize the behaviour of the download process (e.g. specify a `User-Agent` via
#' [user_agent()], set proxy config via [use_proxy()], etc.).
#'
#' The function is also "cache-aware" in the sense that you must deliberately specify
#' `overwrite = TRUE` to force a re-download. This has the potential to save bandwidth
#' for both the caller and the site hosting files for download.
#'
#' @note While this function supports specifying multiple URLs and download paths, it
#' does not perform concurrent downloads.
#' @param url The URL(s) of the file(s) to retrieve. If multiple URLs are provided then the same
#' number of [path]s must also be provided.
#' @param path Path(s) to save content to. If more than one `path` is specified then the same
#' number of [url]s must also be provided. This parameter will be [path.expand()]ed.
#' @param overwrite Will only overwrite existing path if `TRUE`.
#' @param ... passed on to [GET()]
#' @return a data frame containing the `url`(s), `path`(s), cache status, and HTTP status code(s).
#' If there was an error downloading a file, the path and status code columns
#' will be `NA`. If the file was not re-downloaded, the status code will be 399.
#' @seealso [GET()]; [write_disk()]
#' @export
#' @examples
#' tmp1 <- tempfile()
#' tmp2 <- tempfile()
#' tmp3 <- tempfile()
#'
#' download_file("https://google.com", tmp1) # downloads fine
#' download_file("https://google.com", tmp1) # doesn't re-download since it's cached
#' download_file("https://google.com", tmp1, overwrite = TRUE) # re-downloads (overwrites file)
#' download_file("https://google.com", tmp2) # downloads (new path)
#' download_file("badurl", tmp3)) # handles major errors gracefully
#'
#' # multi-file example with no caching
#' download_file(
#'   c(rep("https://google.com", 2), "badurl"),
#'   c(tmp1, tmp2, tmp3),
#'   overwrite = TRUE
#' )
#'
#' # multi-file example with caching
#' download_file(
#'   c(rep("https://google.com", 2), "badurl"),
#'   c(tmp1, tmp2, tmp3),
#'   overwrite = FALSE
#' )
download_file <- function(url, path, overwrite = FALSE, ...) {

  url <- as.character(url)
  path <- as.character(path)

  if (length(url) != length(path)) {
    stop("The lengths of the 'url' and 'path' parameters must be equal.", call. = FALSE)
  }

  path <- path.expand(path)

  overwrite <- as.logical(overwrite)
  stopifnot(length(overwrite) == 1)

  out <- vector("list", length = length(url))

  for (idx in seq_along(url)) {

    u <- url[[idx]]
    p <- path[[idx]]

    if (file.exists(p)) {

      if (overwrite) { # file exists but caller wants to re-download
        res <- safe_GET(u, write_disk(p, overwrite = TRUE), ...)
        if (is.null(res)) {
          p <- NA_character_
          cache_used <- FALSE
          status <- NA_integer_
        } else {
          cache_used <- FALSE
          status <- status_code(res)
        }
      } else { # file exists but caller does not want to re-download
        if (is.null(parse_url(u)[["hostname"]])) { # quick non-network test for invalid URL
          p <- NA_character_
          cache_used <- FALSE
          status <- NA_integer_
        } else {
          cache_used <- TRUE
          status <- 399L # non-standard status code used to signal a cache hit
        }
      }

    } else { # file does not exist, so download it

      res <- safe_GET(u, write_disk(p, overwrite = overwrite), ...)

      if (is.null(res)) {
        p <- NA_character_
        cache_used <- FALSE
        status <- NA_integer_
      } else {
        status <- status_code(res)
        cache_used <- FALSE
      }

    }

    out[[idx]] <- data.frame(
      url = u, path = p,
      status_code = status,
      cache_used = cache_used,
      stringsAsFactors = FALSE
    )

  }

  out <- do.call(rbind.data.frame, out)
  class(out) <- c("tbl_df", "tbl", "data.frame")

  invisible(out)

}












diff --git a/R/write-function.R b/R/write-function.R
index e63e6e10..5299e353 100644
--- a/R/write-function.R
+++ b/R/write-function.R
@@ -17,17 +17,17 @@ write_function <- function(subclass, ...) {
 #' it avoids a round-trip to disk. If you want to save a file that's bigger
 #' than memory, use `write_disk()` to save it to a known path.
 #'
-#' @param path Path to content to.
+#' @param path Path to save content to.
 #' @param overwrite Will only overwrite existing `path` if TRUE.
 #' @export
 #' @examples
 #' tmp <- tempfile()
 #' r1 <- GET("https://www.google.com", write_disk(tmp))
 #' readLines(tmp)
-#' 
+#'
 #' # The default
 #' r2 <- GET("https://www.google.com", write_memory())
-#' 
+#'
 #' # Save a very large file
 #' \dontrun{
 #' GET(
diff --git a/man/download_file.Rd b/man/download_file.Rd
new file mode 100644
index 00000000..eaf1da06
--- /dev/null
+++ b/man/download_file.Rd
@@ -0,0 +1,69 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/download-file.R
\name{download_file}
\alias{download_file}
\title{Download file from the Internet (cache-aware)}
\usage{
download_file(url, path, overwrite = FALSE, ...)
}
\arguments{
\item{url}{The URL(s) of the file(s) to retrieve. If multiple URLs are provided then the same
number of \link{path}s must also be provided.}

\item{path}{Path(s) to save content to. If more than one \code{path} is specified then the same
number of \link{url}s must also be provided. This parameter will be \code{\link[=path.expand]{path.expand()}}ed.}

\item{overwrite}{Will only overwrite existing path if \code{TRUE}.}

\item{...}{passed on to \code{\link[=GET]{GET()}}}
}
\value{
a data frame containing the \code{url}(s), \code{path}(s), cache status, and HTTP status code(s).
If there was an error downloading a file, the path and status code columns
will be \code{NA}. If the file was not re-downloaded, the status code will be 399.
}
\description{
This is an alternative to \code{\link[utils:download.file]{utils::download.file()}} and a convenience wrapper for
\code{\link[=GET]{GET()}} + \code{\link[httr:write_disk]{httr::write_disk()}} to perform file downloads.
}
\details{
Since this function uses \code{\link[=GET]{GET()}}, callers can pass in \code{httr} configuration
options to customize the behaviour of the download process (e.g. specify a \code{User-Agent} via
\code{\link[=user_agent]{user_agent()}}, set proxy config via \code{\link[=use_proxy]{use_proxy()}}, etc.).

The function is also "cache-aware" in the sense that you must deliberately specify
\code{overwrite = TRUE} to force a re-download. This has the potential to save bandwidth
for both the caller and the site hosting files for download.
}
\note{
While this function supports specifying multiple URLs and download paths, it
does not perform concurrent downloads.
}
\examples{
tmp1 <- tempfile()
tmp2 <- tempfile()
tmp3 <- tempfile()

download_file("https://google.com", tmp1) # downloads fine
download_file("https://google.com", tmp1) # doesn't re-download since it's cached
download_file("https://google.com", tmp1, overwrite = TRUE) # re-downloads (overwrites file)
download_file("https://google.com", tmp2) # downloads (new path)
download_file("badurl", tmp3)) # handles major errors gracefully

# multi-file example with no caching
download_file(
  c(rep("https://google.com", 2), "badurl"),
  c(tmp1, tmp2, tmp3),
  overwrite = TRUE
)

# multi-file example with caching
download_file(
  c(rep("https://google.com", 2), "badurl"),
  c(tmp1, tmp2, tmp3),
  overwrite = FALSE
)
}
\seealso{
\code{\link[=GET]{GET()}}; \code{\link[=write_disk]{write_disk()}}
}
diff --git a/man/httr-package.Rd b/man/httr-package.Rd
index ffab2a8e..8535d5ea 100644
--- a/man/httr-package.Rd
+++ b/man/httr-package.Rd
@@ -34,6 +34,7 @@ github, google, linkedin, reddit, yahoo, and yelp).
 \seealso{
 Useful links:
 \itemize{
+  \item \url{https://httr.r-lib.org/}
   \item \url{https://github.com/r-lib/httr}
   \item Report bugs at \url{https://github.com/r-lib/httr/issues}
 }
diff --git a/man/write_disk.Rd b/man/write_disk.Rd
index 993e8b0b..9455bbad 100644
--- a/man/write_disk.Rd
+++ b/man/write_disk.Rd
@@ -10,7 +10,7 @@ write_disk(path, overwrite = FALSE)
 write_memory()
 }
 \arguments{
-\item{path}{Path to content to.}
+\item{path}{Path to save content to.}
 
 \item{overwrite}{Will only overwrite existing \code{path} if TRUE.}
 }
diff --git a/tests/testthat/test-download-file.R b/tests/testthat/test-download-file.R
new file mode 100644
index 00000000..d6ac1f1f
--- /dev/null
+++ b/tests/testthat/test-download-file.R
@@ -0,0 +1,51 @@
context("test-download-file")

test_that("download_file length 1 ops behave as expected", {

  tmp1 <- tempfile()
  tmp2 <- tempfile()
  tmp3 <- tempfile()

  res <- download_file("https://httpbin.org", tmp1)
  expect_equal(res$status_code[[1]], 200L)
  expect_false(res$cache_used[[1]])

  res <- download_file("https://httpbin.org", tmp1)
  expect_equal(res$status_code[[1]], 399L)
  expect_true(res$cache_used[[1]])

  res <- download_file("https://httpbin.org", tmp1, overwrite = TRUE)
  expect_equal(res$status_code[[1]], 200L)
  expect_false(res$cache_used[[1]])

  res <- download_file("badurl", tmp3)
  expect_true(is.na(res$status_code[[1]]))
  expect_false(res$cache_used[[1]])

})

test_that("download_file multi-file ops behave as expected", {

  tmp1 <- tempfile()
  tmp2 <- tempfile()
  tmp3 <- tempfile()

  res <- download_file(
    c(rep("https://google.com", 2), "badurl"),
    c(tmp1, tmp2, tmp3),
    overwrite = TRUE
  )

  expect_identical(res$status_code, c(200L, 200L, NA))
  expect_identical(res$cache_used, c(FALSE, FALSE, FALSE))

  res <- download_file(
    c(rep("https://google.com", 2), "badurl"),
    c(tmp1, tmp2, tmp3),
    overwrite = FALSE
  )

  expect_identical(res$status_code, c(399L, 399L, NA))
  expect_identical(res$cache_used, c(TRUE, TRUE, FALSE))

})
From 55ca7da493aa2da089159baa313e2f00eaa55fd0 Mon Sep 17 00:00:00 2001
From: hrbrmstr
Date: Wed, 19 Jun 2019 09:10:28 -0400
Subject: [PATCH 2/3] fixed example in `download_file()`

---
 R/download-file.R    | 2 +-
 man/download_file.Rd | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/download-file.R b/R/download-file.R
index 7e5bb4a1..b8c22bed 100644
--- a/R/download-file.R
+++ b/R/download-file.R
@@ -52,7 +52,7 @@ safe_GET <- possibly(GET, NULL, quiet = TRUE)
 #' download_file("https://google.com", tmp1) # doesn't re-download since it's cached
 #' download_file("https://google.com", tmp1, overwrite = TRUE) # re-downloads (overwrites file)
 #' download_file("https://google.com", tmp2) # downloads (new path)
-#' download_file("badurl", tmp3)) # handles major errors gracefully
+#' download_file("badurl", tmp3) # handles major errors gracefully
 #'
 #' # multi-file example with no caching
 #' download_file(
diff --git a/man/download_file.Rd b/man/download_file.Rd
index eaf1da06..72538544 100644
--- a/man/download_file.Rd
+++ b/man/download_file.Rd
@@ -48,7 +48,7 @@ download_file("https://google.com", tmp1) # downloads fine
 download_file("https://google.com", tmp1) # doesn't re-download since it's cached
 download_file("https://google.com", tmp1, overwrite = TRUE) # re-downloads (overwrites file)
 download_file("https://google.com", tmp2) # downloads (new path)
-download_file("badurl", tmp3)) # handles major errors gracefully
+download_file("badurl", tmp3) # handles major errors gracefully
 
 # multi-file example with no caching
 download_file(
From d27810850abfea8482f92044fffbd81a8363d019 Mon Sep 17 00:00:00 2001
From: hrbrmstr
Date: Wed, 19 Jun 2019 09:12:23 -0400
Subject: [PATCH 3/3] tweaked docs a bit in `download_file()`

---
 R/download-file.R    | 4 ++--
 man/download_file.Rd | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/download-file.R b/R/download-file.R
index b8c22bed..75c12b5b 100644
--- a/R/download-file.R
+++ b/R/download-file.R
@@ -33,9 +33,9 @@ safe_GET <- possibly(GET, NULL, quiet = TRUE)
 #' @note While this function supports specifying multiple URLs and download paths, it
 #' does not perform concurrent downloads.
 #' @param url The URL(s) of the file(s) to retrieve. If multiple URLs are provided then the same
-#' number of [path]s must also be provided.
+#' number of `path`s must also be provided.
 #' @param path Path(s) to save content to. If more than one `path` is specified then the same
-#' number of [url]s must also be provided. This parameter will be [path.expand()]ed.
+#' number of `url`s must also be provided. This parameter will be [path.expand()]ed.
 #' @param overwrite Will only overwrite existing path if `TRUE`.
 #' @param ... passed on to [GET()]
 #' @return a data frame containing the `url`(s), `path`(s), cache status, and HTTP status code(s).
diff --git a/man/download_file.Rd b/man/download_file.Rd
index 72538544..ec9c145a 100644
--- a/man/download_file.Rd
+++ b/man/download_file.Rd
@@ -8,10 +8,10 @@ download_file(url, path, overwrite = FALSE, ...)
 }
 \arguments{
 \item{url}{The URL(s) of the file(s) to retrieve. If multiple URLs are provided then the same
-number of \link{path}s must also be provided.}
+number of \code{path}s must also be provided.}
 
 \item{path}{Path(s) to save content to. If more than one \code{path} is specified then the same
-number of \link{url}s must also be provided. This parameter will be \code{\link[=path.expand]{path.expand()}}ed.}
+number of \code{url}s must also be provided. This parameter will be \code{\link[=path.expand]{path.expand()}}ed.}
 
 \item{overwrite}{Will only overwrite existing path if \code{TRUE}.}
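
A minimal usage sketch of the `download_file()` API the patches above introduce (this note is not part of the patch series itself). It assumes a build of httr with all three patches applied and network access to https://google.com; the 399 and `NA` checks follow the return-value conventions documented in the roxygen block (399 means the cached copy was used, an `NA` status code means the download failed).

  library(httr)

  tmp <- tempfile(fileext = ".html")

  # download_file() returns its summary data frame invisibly, so capture it
  res <- download_file("https://google.com", tmp)  # first call: fresh download
  res <- download_file("https://google.com", tmp)  # second call: cache hit (399)

  status <- res$status_code[[1]]
  if (is.na(status)) {
    message("download failed for: ", res$url[[1]])
  } else if (status == 399L) {
    message("used cached copy at: ", res$path[[1]])
  } else {
    message("downloaded (HTTP ", status, ") to: ", res$path[[1]])
  }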