From b1d6f8e120bd53586888ae4f9cf29e0c4fd14a90 Mon Sep 17 00:00:00 2001
From: hrbrmstr
Date: Wed, 19 Jun 2019 08:57:10 -0400
Subject: [PATCH 1/3] added `download_file()`; fixed missing word in `write_disk()` docs

---
 NAMESPACE                           |   1 +
 NEWS.md                             |   3 +
 R/download-file.R                   | 155 ++++++++++++++++++++++++++++
 R/write-function.R                  |   6 +-
 man/download_file.Rd                |  69 +++++++++++++
 man/httr-package.Rd                 |   1 +
 man/write_disk.Rd                   |   2 +-
 tests/testthat/test-download-file.R |  51 +++++++++
 8 files changed, 284 insertions(+), 4 deletions(-)
 create mode 100644 R/download-file.R
 create mode 100644 man/download_file.Rd
 create mode 100644 tests/testthat/test-download-file.R

diff --git a/NAMESPACE b/NAMESPACE
index a3821b9b..8a257e75 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -51,6 +51,7 @@ export(content_type_json)
 export(content_type_xml)
 export(cookies)
 export(curl_docs)
+export(download_file)
 export(get_callback)
 export(guess_media)
 export(handle)
diff --git a/NEWS.md b/NEWS.md
index d3e26842..01d3621c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,6 +9,9 @@
 * `RETRY()` now throws the correct error message if an error occurs during
   the request (@austin3dickey, #581).
 
+* New `download_file()` function to support cache-aware single and multi-URL
+  file downloads (@hrbrmstr).
+
 # httr 1.4.0
 
 ## OAuth
diff --git a/R/download-file.R b/R/download-file.R
new file mode 100644
index 00000000..7e5bb4a1
--- /dev/null
+++ b/R/download-file.R
@@ -0,0 +1,155 @@
possibly <- function(.f, otherwise, quiet = TRUE) { # local stand-in for purrr::possibly()
  force(otherwise)
  function(...) {
    tryCatch(
      .f(...),
      error = function(e) {
        if (!quiet)
          message("Error: ", e$message)
        otherwise
      },
      interrupt = function(e) {
        stop("Terminated by user", call. = FALSE)
      }
    )
  }
}

safe_GET <- possibly(GET, NULL, quiet = TRUE)

#' Download file from the Internet (cache-aware)
#'
#' This is an alternative to [utils::download.file()] and a convenience wrapper for
#' [GET()] + [httr::write_disk()] to perform file downloads.
#'
#' Since this function uses [GET()], callers can pass in `httr` configuration
#' options to customize the behaviour of the download process (e.g. specify a `User-Agent` via
#' [user_agent()], set proxy config via [use_proxy()], etc.).
#'
#' The function is also "cache-aware" in the sense that you must deliberately specify
#' `overwrite = TRUE` to force a re-download. This has the potential to save bandwidth
#' for both the caller and the site hosting files for download.
#'
#' @note While this function supports specifying multiple URLs and download paths, it
#' does not perform concurrent downloads.
#' @param url The URL(s) of the file(s) to retrieve. If multiple URLs are provided then the same
#' number of [path]s must also be provided.
#' @param path Path(s) to save content to. If more than one `path` is specified then the same
#' number of [url]s must also be provided. This parameter will be [path.expand()]ed.
#' @param overwrite Will only overwrite existing path if `TRUE`.
#' @param ... passed on to [GET()]
#' @return a data frame containing the `url`(s), `path`(s), cache status, and HTTP status code(s).
#' If there was an error downloading a file, the path and status code columns
#' will be `NA`. If the file was not re-downloaded, the status code will be 399.
#' @seealso [GET()]; [write_disk()]
#' @export
#' @examples
#' tmp1 <- tempfile()
#' tmp2 <- tempfile()
#' tmp3 <- tempfile()
#'
#' download_file("https://google.com", tmp1) # downloads fine
#' download_file("https://google.com", tmp1) # doesn't re-download since it's cached
#' download_file("https://google.com", tmp1, overwrite = TRUE) # re-downloads (overwrites file)
#' download_file("https://google.com", tmp2) # downloads (new path)
#' download_file("badurl", tmp3)) # handles major errors gracefully
#'
#' # multi-file example with no caching
#' download_file(
#'   c(rep("https://google.com", 2), "badurl"),
#'   c(tmp1, tmp2, tmp3),
#'   overwrite = TRUE
#' )
#'
#' # multi-file example with caching
#' download_file(
#'   c(rep("https://google.com", 2), "badurl"),
#'   c(tmp1, tmp2, tmp3),
#'   overwrite = FALSE
#' )
download_file <- function(url, path, overwrite = FALSE, ...) {

  url <- as.character(url)
  path <- as.character(path)

  if (length(url) != length(path)) {
    stop("The lengths of the 'url' and 'path' parameters must be equal.", call. = FALSE)
  }

  path <- path.expand(path)

  overwrite <- as.logical(overwrite)
  stopifnot(length(overwrite) == 1)

  out <- vector("list", length = length(url))

  for (idx in seq_along(url)) {

    u <- url[[idx]]
    p <- path[[idx]]

    if (file.exists(p)) {

      if (overwrite) { # file exists but caller wants to re-download
        res <- safe_GET(u, write_disk(p, overwrite = TRUE), ...)
        if (is.null(res)) {
          p <- NA_character_
          cache_used <- FALSE
          status <- NA_integer_
        } else {
          cache_used <- FALSE
          status <- status_code(res)
        }
      } else { # file exists but caller does not want to re-download
        if (is.null(parse_url(u)[["hostname"]])) { # quick non-network test for invalid URL
          p <- NA_character_
          cache_used <- FALSE
          status <- NA_integer_
        } else {
          cache_used <- TRUE
          status <- 399L # non-standard status code used to signal a cache hit
        }
      }

    } else { # file does not exist, so download it

      res <- safe_GET(u, write_disk(p, overwrite = overwrite), ...)

      if (is.null(res)) {
        p <- NA_character_
        cache_used <- FALSE
        status <- NA_integer_
      } else {
        status <- status_code(res)
        cache_used <- FALSE
      }

    }

    out[[idx]] <- data.frame(
      url = u, path = p,
      status_code = status,
      cache_used = cache_used,
      stringsAsFactors = FALSE
    )

  }

  out <- do.call(rbind.data.frame, out)
  class(out) <- c("tbl_df", "tbl", "data.frame")

  invisible(out)

}












diff --git a/R/write-function.R b/R/write-function.R
index e63e6e10..5299e353 100644
--- a/R/write-function.R
+++ b/R/write-function.R
@@ -17,17 +17,17 @@ write_function <- function(subclass, ...) {
 #' it avoids a round-trip to disk. If you want to save a file that's bigger
 #' than memory, use `write_disk()` to save it to a known path.
 #'
-#' @param path Path to content to.
+#' @param path Path to save content to.
 #' @param overwrite Will only overwrite existing `path` if TRUE.
 #' @export
 #' @examples
 #' tmp <- tempfile()
 #' r1 <- GET("https://www.google.com", write_disk(tmp))
 #' readLines(tmp)
-#' 
+#'
 #' # The default
 #' r2 <- GET("https://www.google.com", write_memory())
-#' 
+#'
 #' # Save a very large file
 #' \dontrun{
 #' GET(
diff --git a/man/download_file.Rd b/man/download_file.Rd
new file mode 100644
index 00000000..eaf1da06
--- /dev/null
+++ b/man/download_file.Rd
@@ -0,0 +1,69 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/download-file.R
\name{download_file}
\alias{download_file}
\title{Download file from the Internet (cache-aware)}
\usage{
download_file(url, path, overwrite = FALSE, ...)
}
\arguments{
\item{url}{The URL(s) of the file(s) to retrieve. If multiple URLs are provided then the same
number of \link{path}s must also be provided.}

\item{path}{Path(s) to save content to. If more than one \code{path} is specified then the same
number of \link{url}s must also be provided. This parameter will be \code{\link[=path.expand]{path.expand()}}ed.}

\item{overwrite}{Will only overwrite existing path if \code{TRUE}.}

\item{...}{passed on to \code{\link[=GET]{GET()}}}
}
\value{
a data frame containing the \code{url}(s), \code{path}(s), cache status, and HTTP status code(s).
If there was an error downloading a file, the path and status code columns
will be \code{NA}. If the file was not re-downloaded, the status code will be 399.
}
\description{
This is an alternative to \code{\link[utils:download.file]{utils::download.file()}} and a convenience wrapper for
\code{\link[=GET]{GET()}} + \code{\link[httr:write_disk]{httr::write_disk()}} to perform file downloads.
}
\details{
Since this function uses \code{\link[=GET]{GET()}}, callers can pass in \code{httr} configuration
options to customize the behaviour of the download process (e.g. specify a \code{User-Agent} via
\code{\link[=user_agent]{user_agent()}}, set proxy config via \code{\link[=use_proxy]{use_proxy()}}, etc.).

The function is also "cache-aware" in the sense that you must deliberately specify
\code{overwrite = TRUE} to force a re-download. This has the potential to save bandwidth
for both the caller and the site hosting files for download.
}
\note{
While this function supports specifying multiple URLs and download paths, it
does not perform concurrent downloads.
}
\examples{
tmp1 <- tempfile()
tmp2 <- tempfile()
tmp3 <- tempfile()

download_file("https://google.com", tmp1) # downloads fine
download_file("https://google.com", tmp1) # doesn't re-download since it's cached
download_file("https://google.com", tmp1, overwrite = TRUE) # re-downloads (overwrites file)
download_file("https://google.com", tmp2) # downloads (new path)
download_file("badurl", tmp3)) # handles major errors gracefully

# multi-file example with no caching
download_file(
  c(rep("https://google.com", 2), "badurl"),
  c(tmp1, tmp2, tmp3),
  overwrite = TRUE
)

# multi-file example with caching
download_file(
  c(rep("https://google.com", 2), "badurl"),
  c(tmp1, tmp2, tmp3),
  overwrite = FALSE
)
}
\seealso{
\code{\link[=GET]{GET()}}; \code{\link[=write_disk]{write_disk()}}
}
diff --git a/man/httr-package.Rd b/man/httr-package.Rd
index ffab2a8e..8535d5ea 100644
--- a/man/httr-package.Rd
+++ b/man/httr-package.Rd
@@ -34,6 +34,7 @@ github, google, linkedin, reddit, yahoo, and yelp).
 \seealso{
 Useful links:
 \itemize{
+  \item \url{https://httr.r-lib.org/}
   \item \url{https://github.com/r-lib/httr}
   \item Report bugs at \url{https://github.com/r-lib/httr/issues}
 }
diff --git a/man/write_disk.Rd b/man/write_disk.Rd
index 993e8b0b..9455bbad 100644
--- a/man/write_disk.Rd
+++ b/man/write_disk.Rd
@@ -10,7 +10,7 @@ write_disk(path, overwrite = FALSE)
 write_memory()
 }
 \arguments{
-\item{path}{Path to content to.}
+\item{path}{Path to save content to.}
 
 \item{overwrite}{Will only overwrite existing \code{path} if TRUE.}
 }
diff --git a/tests/testthat/test-download-file.R b/tests/testthat/test-download-file.R
new file mode 100644
index 00000000..d6ac1f1f
--- /dev/null
+++ b/tests/testthat/test-download-file.R
@@ -0,0 +1,51 @@
context("test-download-file")

test_that("download_file length 1 ops behave as expected", {

  tmp1 <- tempfile()
  tmp2 <- tempfile()
  tmp3 <- tempfile()

  res <- download_file("https://httpbin.org", tmp1)
  expect_equal(res$status_code[[1]], 200L)
  expect_false(res$cache_used[[1]])

  res <- download_file("https://httpbin.org", tmp1)
  expect_equal(res$status_code[[1]], 399L)
  expect_true(res$cache_used[[1]])

  res <- download_file("https://httpbin.org", tmp1, overwrite = TRUE)
  expect_equal(res$status_code[[1]], 200L)
  expect_false(res$cache_used[[1]])

  res <- download_file("badurl", tmp3)
  expect_true(is.na(res$status_code[[1]]))
  expect_false(res$cache_used[[1]])

})

test_that("download_file multi-file ops behave as expected", {

  tmp1 <- tempfile()
  tmp2 <- tempfile()
  tmp3 <- tempfile()

  res <- download_file(
    c(rep("https://google.com", 2), "badurl"),
    c(tmp1, tmp2, tmp3),
    overwrite = TRUE
  )

  expect_identical(res$status_code, c(200L, 200L, NA))
  expect_identical(res$cache_used, c(FALSE, FALSE, FALSE))

  res <- download_file(
    c(rep("https://google.com", 2), "badurl"),
    c(tmp1, tmp2, tmp3),
    overwrite = FALSE
  )

  expect_identical(res$status_code, c(399L, 399L, NA))
  expect_identical(res$cache_used, c(TRUE, TRUE, FALSE))

})
From 55ca7da493aa2da089159baa313e2f00eaa55fd0 Mon Sep 17 00:00:00 2001
From: hrbrmstr
Date: Wed, 19 Jun 2019 09:10:28 -0400
Subject: [PATCH 2/3] fixed example in `download_file()`

---
 R/download-file.R    | 2 +-
 man/download_file.Rd | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/download-file.R b/R/download-file.R
index 7e5bb4a1..b8c22bed 100644
--- a/R/download-file.R
+++ b/R/download-file.R
@@ -52,7 +52,7 @@ safe_GET <- possibly(GET, NULL, quiet = TRUE)
 #' download_file("https://google.com", tmp1) # doesn't re-download since it's cached
 #' download_file("https://google.com", tmp1, overwrite = TRUE) # re-downloads (overwrites file)
 #' download_file("https://google.com", tmp2) # downloads (new path)
-#' download_file("badurl", tmp3)) # handles major errors gracefully
+#' download_file("badurl", tmp3) # handles major errors gracefully
 #'
 #' # multi-file example with no caching
 #' download_file(
diff --git a/man/download_file.Rd b/man/download_file.Rd
index eaf1da06..72538544 100644
--- a/man/download_file.Rd
+++ b/man/download_file.Rd
@@ -48,7 +48,7 @@ download_file("https://google.com", tmp1) # downloads fine
 download_file("https://google.com", tmp1) # doesn't re-download since it's cached
 download_file("https://google.com", tmp1, overwrite = TRUE) # re-downloads (overwrites file)
 download_file("https://google.com", tmp2) # downloads (new path)
-download_file("badurl", tmp3)) # handles major errors gracefully
+download_file("badurl", tmp3) # handles major errors gracefully
 
 # multi-file example with no caching
 download_file(
From d27810850abfea8482f92044fffbd81a8363d019 Mon Sep 17 00:00:00 2001
From: hrbrmstr
Date: Wed, 19 Jun 2019 09:12:23 -0400
Subject: [PATCH 3/3] tweaked docs a bit in `download_file()`

---
 R/download-file.R    | 4 ++--
 man/download_file.Rd | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/download-file.R b/R/download-file.R
index b8c22bed..75c12b5b 100644
--- a/R/download-file.R
+++ b/R/download-file.R
@@ -33,9 +33,9 @@ safe_GET <- possibly(GET, NULL, quiet = TRUE)
 #' @note While this function supports specifying multiple URLs and download paths, it
 #' does not perform concurrent downloads.
 #' @param url The URL(s) of the file(s) to retrieve. If multiple URLs are provided then the same
-#' number of [path]s must also be provided.
+#' number of `path`s must also be provided.
 #' @param path Path(s) to save content to. If more than one `path` is specified then the same
-#' number of [url]s must also be provided. This parameter will be [path.expand()]ed.
+#' number of `url`s must also be provided. This parameter will be [path.expand()]ed.
 #' @param overwrite Will only overwrite existing path if `TRUE`.
 #' @param ... passed on to [GET()]
 #' @return a data frame containing the `url`(s), `path`(s), cache status, and HTTP status code(s).
diff --git a/man/download_file.Rd b/man/download_file.Rd
index 72538544..ec9c145a 100644
--- a/man/download_file.Rd
+++ b/man/download_file.Rd
@@ -8,10 +8,10 @@ download_file(url, path, overwrite = FALSE, ...)
 }
 \arguments{
 \item{url}{The URL(s) of the file(s) to retrieve. If multiple URLs are provided then the same
-number of \link{path}s must also be provided.}
+number of \code{path}s must also be provided.}
 
 \item{path}{Path(s) to save content to. If more than one \code{path} is specified then the same
-number of \link{url}s must also be provided. This parameter will be \code{\link[=path.expand]{path.expand()}}ed.}
+number of \code{url}s must also be provided. This parameter will be \code{\link[=path.expand]{path.expand()}}ed.}
 
 \item{overwrite}{Will only overwrite existing path if \code{TRUE}.}
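
A minimal usage sketch of the `download_file()` API the patches above introduce (this note is not part of the patch series itself). It assumes a build of httr with all three patches applied and network access to https://google.com; the 399 and `NA` checks follow the return-value conventions documented in the roxygen block (399 means the cached copy was used, an `NA` status code means the download failed).

  library(httr)

  tmp <- tempfile(fileext = ".html")

  # download_file() returns its summary data frame invisibly, so capture it
  res <- download_file("https://google.com", tmp)  # first call: fresh download
  res <- download_file("https://google.com", tmp)  # second call: cache hit (399)

  status <- res$status_code[[1]]
  if (is.na(status)) {
    message("download failed for: ", res$url[[1]])
  } else if (status == 399L) {
    message("used cached copy at: ", res$path[[1]])
  } else {
    message("downloaded (HTTP ", status, ") to: ", res$path[[1]])
  }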