diff --git a/NAMESPACE b/NAMESPACE
index a3821b9b..8a257e75 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -51,6 +51,7 @@ export(content_type_json)
 export(content_type_xml)
 export(cookies)
 export(curl_docs)
+export(download_file)
 export(get_callback)
 export(guess_media)
 export(handle)
diff --git a/NEWS.md b/NEWS.md
index d3e26842..01d3621c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,6 +9,9 @@
 * `RETRY()` now throws the correct error message if an error occurs during the
   request (@austin3dickey, #581).
 
+* New `download_file()` function to support cache-aware single and multi-URL
+  file downloads (@hrbrmstr).
+
 # httr 1.4.0
 
 ## OAuth
diff --git a/R/download-file.R b/R/download-file.R
new file mode 100644
index 00000000..75c12b5b
--- /dev/null
+++ b/R/download-file.R
@@ -0,0 +1,155 @@
+# Local, dependency-free take on purrr::possibly(): wrap a function so that
+# errors return `otherwise` instead of aborting, while interrupts still stop.
+possibly <- function(.f, otherwise, quiet = TRUE) {
+  force(otherwise)
+  function(...) {
+    tryCatch(
+      .f(...),
+      error = function(e) {
+        if (!quiet)
+          message("Error: ", e$message)
+        otherwise
+      },
+      interrupt = function(e) {
+        stop("Terminated by user", call. = FALSE)
+      }
+    )
+  }
+}
+
+safe_GET <- possibly(GET, NULL, quiet = TRUE)
+
+#' Download file from the Internet (cache-aware)
+#'
+#' This is an alternative to [utils::download.file()] and a convenience wrapper for
+#' [GET()] + [httr::write_disk()] to perform file downloads.
+#'
+#' Since this function uses [GET()], callers can pass in `httr` configuration
+#' options to customize the behaviour of the download process (e.g. specify a `User-Agent` via
+#' [user_agent()], set proxy config via [use_proxy()], etc.).
+#'
+#' The function is also "cache-aware" in the sense that you must deliberately specify
+#' `overwrite = TRUE` to force a re-download. This can save bandwidth for both the
+#' caller and the site hosting the files.
+#'
+#' @note While this function supports specifying multiple URLs and download paths, it
+#'   does not perform concurrent downloads.
+#' @param url URL(s) of the file(s) to retrieve. If multiple URLs are provided, the same
+#'   number of `path`s must also be provided.
+#' @param path Path(s) to save content to. If more than one `path` is specified, the same
+#'   number of `url`s must also be provided. This parameter will be [path.expand()]ed.
+#' @param overwrite Will only overwrite an existing path if `TRUE`.
+#' @param ... passed on to [GET()]
+#' @return A data frame containing the `url`(s), `path`(s), cache status, and HTTP status
+#'   code(s). If there was an error downloading a file, the path and status code columns
+#'   will be `NA`. If the file was not re-downloaded (the cached copy was used), the
+#'   status code will be `399`.
+#' @seealso [GET()]; [write_disk()]
+#' @export
+#' @examples
+#' tmp1 <- tempfile()
+#' tmp2 <- tempfile()
+#' tmp3 <- tempfile()
+#'
+#' download_file("https://google.com", tmp1) # downloads fine
+#' download_file("https://google.com", tmp1) # doesn't re-download since it's cached
+#' download_file("https://google.com", tmp1, overwrite = TRUE) # re-downloads (overwrites file)
+#' download_file("https://google.com", tmp2) # re-downloads (new file)
+#' download_file("badurl", tmp3) # handles major errors gracefully
+#'
+#' # multi-file example without caching
+#' download_file(
+#'   c(rep("https://google.com", 2), "badurl"),
+#'   c(tmp1, tmp2, tmp3),
+#'   overwrite = TRUE
+#' )
+#'
+#' # multi-file example with caching
+#' download_file(
+#'   c(rep("https://google.com", 2), "badurl"),
+#'   c(tmp1, tmp2, tmp3),
+#'   overwrite = FALSE
+#' )
+download_file <- function(url, path, overwrite = FALSE, ...) {
+
+  url <- as.character(url)
+  path <- as.character(path)
+
+  if (length(url) != length(path)) {
+    stop("The lengths of the 'url' and 'path' parameters must be equal.", call. = FALSE)
+  }
+
+  path <- path.expand(path)
+
+  overwrite <- as.logical(overwrite)
+  stopifnot(length(overwrite) == 1)
+
+  out <- vector("list", length = length(url))
+
+  for (idx in seq_along(url)) {
+
+    u <- url[[idx]]
+    p <- path[[idx]]
+
+    if (file.exists(p)) {
+
+      if (overwrite) { # file exists but caller wants to re-download
+        res <- safe_GET(u, write_disk(p, overwrite = TRUE), ...)
+        if (is.null(res)) { # request errored
+          p <- NA_character_
+          cache_used <- FALSE
+          status <- NA_integer_
+        } else {
+          cache_used <- FALSE
+          status <- status_code(res)
+        }
+      } else { # file exists and caller does not want to re-download
+        if (is.null(parse_url(u)[["hostname"]])) { # quick non-network test for an invalid URL
+          p <- NA_character_
+          cache_used <- FALSE
+          status <- NA_integer_
+        } else {
+          cache_used <- TRUE
+          status <- 399L # sentinel: cached copy used, no request made
+        }
+      }
+
+    } else { # file does not exist, so do the download
+
+      res <- safe_GET(u, write_disk(p, overwrite = overwrite), ...)
+
+      if (is.null(res)) { # request errored
+        p <- NA_character_
+        cache_used <- FALSE
+        status <- NA_integer_
+      } else {
+        status <- status_code(res)
+        cache_used <- FALSE
+      }
+
+    }
+
+    out[[idx]] <- data.frame(
+      url = u, path = p,
+      status_code = status,
+      cache_used = cache_used,
+      stringsAsFactors = FALSE
+    )
+
+  }
+
+  out <- do.call(rbind.data.frame, out)
+  class(out) <- c("tbl_df", "tbl", "data.frame") # prints as a tibble when available
+
+  invisible(out)
+
+}
diff --git a/R/write-function.R b/R/write-function.R
index e63e6e10..5299e353 100644
--- a/R/write-function.R
+++ b/R/write-function.R
@@ -17,17 +17,17 @@ write_function <- function(subclass, ...) {
 #' it avoids a round-trip to disk. If you want to save a file that's bigger
 #' than memory, use `write_disk()` to save it to a known path.
 #'
-#' @param path Path to content to.
+#' @param path Path to save content to.
 #' @param overwrite Will only overwrite existing `path` if TRUE.
 #' @export
 #' @examples
 #' tmp <- tempfile()
 #' r1 <- GET("https://www.google.com", write_disk(tmp))
 #' readLines(tmp)
-#' 
+#'
 #' # The default
 #' r2 <- GET("https://www.google.com", write_memory())
-#' 
+#'
 #' # Save a very large file
 #' \dontrun{
 #' GET(
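
An aside on the error trapping above: `possibly()` is a local copy of purrr's adverb of the same name, pulled in so httr gains the behaviour without a new dependency. `safe_GET()` therefore returns `NULL` instead of throwing, which is the sentinel `download_file()` branches on. A quick illustration (`safe_GET()` is internal, so this is only runnable inside the package's namespace; `badurl` fails at the curl level exactly as in the examples above):

res <- safe_GET("badurl")  # GET() would error here; possibly() traps it
is.null(res)               # TRUE -- the "download failed" sentinel
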
diff --git a/man/download_file.Rd b/man/download_file.Rd
new file mode 100644
index 00000000..ec9c145a
--- /dev/null
+++ b/man/download_file.Rd
@@ -0,0 +1,69 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/download-file.R
+\name{download_file}
+\alias{download_file}
+\title{Download file from the Internet (cache-aware)}
+\usage{
+download_file(url, path, overwrite = FALSE, ...)
+}
+\arguments{
+\item{url}{URL(s) of the file(s) to retrieve. If multiple URLs are provided, the same
+number of \code{path}s must also be provided.}
+
+\item{path}{Path(s) to save content to. If more than one \code{path} is specified, the same
+number of \code{url}s must also be provided. This parameter will be \code{\link[=path.expand]{path.expand()}}ed.}
+
+\item{overwrite}{Will only overwrite an existing path if \code{TRUE}.}
+
+\item{...}{passed on to \code{\link[=GET]{GET()}}}
+}
+\value{
+A data frame containing the \code{url}(s), \code{path}(s), cache status, and HTTP status
+code(s). If there was an error downloading a file, the path and status code columns
+will be \code{NA}. If the file was not re-downloaded (the cached copy was used), the
+status code will be \code{399}.
+}
+\description{
+This is an alternative to \code{\link[utils:download.file]{utils::download.file()}} and a convenience wrapper for
+\code{\link[=GET]{GET()}} + \code{\link[httr:write_disk]{httr::write_disk()}} to perform file downloads.
+}
+\details{
+Since this function uses \code{\link[=GET]{GET()}}, callers can pass in \code{httr} configuration
+options to customize the behaviour of the download process (e.g. specify a \code{User-Agent} via
+\code{\link[=user_agent]{user_agent()}}, set proxy config via \code{\link[=use_proxy]{use_proxy()}}, etc.).
+
+The function is also "cache-aware" in the sense that you must deliberately specify
+\code{overwrite = TRUE} to force a re-download. This can save bandwidth for both the
+caller and the site hosting the files.
+}
+\note{
+While this function supports specifying multiple URLs and download paths, it
+does not perform concurrent downloads.
+}
+\examples{
+tmp1 <- tempfile()
+tmp2 <- tempfile()
+tmp3 <- tempfile()
+
+download_file("https://google.com", tmp1) # downloads fine
+download_file("https://google.com", tmp1) # doesn't re-download since it's cached
+download_file("https://google.com", tmp1, overwrite = TRUE) # re-downloads (overwrites file)
+download_file("https://google.com", tmp2) # re-downloads (new file)
+download_file("badurl", tmp3) # handles major errors gracefully
+
+# multi-file example without caching
+download_file(
+  c(rep("https://google.com", 2), "badurl"),
+  c(tmp1, tmp2, tmp3),
+  overwrite = TRUE
+)
+
+# multi-file example with caching
+download_file(
+  c(rep("https://google.com", 2), "badurl"),
+  c(tmp1, tmp2, tmp3),
+  overwrite = FALSE
+)
+}
+\seealso{
+\code{\link[=GET]{GET()}}; \code{\link[=write_disk]{write_disk()}}
+}
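
One practical consequence of the return shape documented above: results come back invisibly, and for a failed download the `path` column is `NA`, so callers who want to retry failures must keep their own url/path vectors around. A minimal retry sketch (illustrative only, not part of this patch):

urls  <- c("https://google.com", "badurl")
paths <- c(tempfile(), tempfile())

res <- download_file(urls, paths)

# An NA status_code marks a failed download; retry those rows by indexing
# the original vectors (res$path is NA for failed rows, so it can't be reused).
failed <- is.na(res$status_code)
if (any(failed)) {
  download_file(urls[failed], paths[failed], overwrite = TRUE)
}
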
diff --git a/man/httr-package.Rd b/man/httr-package.Rd
index ffab2a8e..8535d5ea 100644
--- a/man/httr-package.Rd
+++ b/man/httr-package.Rd
@@ -34,6 +34,7 @@ github, google, linkedin, reddit, yahoo, and yelp).
 \seealso{
 Useful links:
 \itemize{
+  \item \url{https://httr.r-lib.org/}
   \item \url{https://github.com/r-lib/httr}
   \item Report bugs at \url{https://github.com/r-lib/httr/issues}
 }
diff --git a/man/write_disk.Rd b/man/write_disk.Rd
index 993e8b0b..9455bbad 100644
--- a/man/write_disk.Rd
+++ b/man/write_disk.Rd
@@ -10,7 +10,7 @@ write_disk(path, overwrite = FALSE)
 write_memory()
 }
 \arguments{
-\item{path}{Path to content to.}
+\item{path}{Path to save content to.}
 
 \item{overwrite}{Will only overwrite existing \code{path} if TRUE.}
 }
diff --git a/tests/testthat/test-download-file.R b/tests/testthat/test-download-file.R
new file mode 100644
index 00000000..d6ac1f1f
--- /dev/null
+++ b/tests/testthat/test-download-file.R
@@ -0,0 +1,51 @@
+context("test-download-file")
+
+test_that("download_file length 1 ops behave as expected", {
+
+  tmp1 <- tempfile()
+  tmp2 <- tempfile()
+  tmp3 <- tempfile()
+
+  res <- download_file("https://httpbin.org", tmp1)
+  expect_equal(res$status_code[[1]], 200L)
+  expect_false(res$cache_used[[1]])
+
+  res <- download_file("https://httpbin.org", tmp1)
+  expect_equal(res$status_code[[1]], 399L)
+  expect_true(res$cache_used[[1]])
+
+  res <- download_file("https://httpbin.org", tmp1, overwrite = TRUE)
+  expect_equal(res$status_code[[1]], 200L)
+  expect_false(res$cache_used[[1]])
+
+  res <- download_file("badurl", tmp3)
+  expect_true(is.na(res$status_code[[1]]))
+  expect_false(res$cache_used[[1]])
+
+})
+
+test_that("download_file multi-file ops behave as expected", {
+
+  tmp1 <- tempfile()
+  tmp2 <- tempfile()
+  tmp3 <- tempfile()
+
+  res <- download_file(
+    c(rep("https://google.com", 2), "badurl"),
+    c(tmp1, tmp2, tmp3),
+    overwrite = TRUE
+  )
+
+  expect_identical(res$status_code, c(200L, 200L, NA))
+  expect_identical(res$cache_used, c(FALSE, FALSE, FALSE))
+
+  res <- download_file(
+    c(rep("https://google.com", 2), "badurl"),
+    c(tmp1, tmp2, tmp3),
+    overwrite = FALSE
+  )
+
+  expect_identical(res$status_code, c(399L, 399L, NA))
+  expect_identical(res$cache_used, c(TRUE, TRUE, FALSE))
+
+})
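
A possible follow-on worth noting (not in this patch): the cache check is purely local (`file.exists()`), so a stale file on disk is never revalidated against the server. Where the origin honours conditional requests, a revalidating variant could be layered on top of exported httr helpers. A rough sketch, assuming the server supports `If-Modified-Since` (`revalidate_file()` is a hypothetical name, not part of httr):

library(httr)

# Hypothetical revalidating download: re-fetches only when the server says
# the resource changed since the local copy was written; a 304 keeps the cache.
revalidate_file <- function(url, path, ...) {
  if (!file.exists(path)) {
    return(invisible(GET(url, write_disk(path), ...)))
  }
  since <- http_date(file.info(path)$mtime)
  res <- GET(url, add_headers(`If-Modified-Since` = since), ...)
  if (status_code(res) == 200L) {
    writeBin(content(res, as = "raw"), path) # changed upstream: refresh the copy
  }
  invisible(res)
}
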