From 7decc11ecdee535d8a2c417e00b964da7cdef540 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 5 Oct 2024 20:23:42 +0200 Subject: [PATCH] docs: mention capability to read parquet files from HuggingFace (#1248) --- R/io_parquet.R | 34 +++++++++++++++++++++++++++------- man/IO_read_parquet.Rd | 32 +++++++++++++++++++++++++++----- man/IO_scan_parquet.Rd | 32 +++++++++++++++++++++++++++----- 3 files changed, 81 insertions(+), 17 deletions(-) diff --git a/R/io_parquet.R b/R/io_parquet.R index 348d1ba5c..58f919ed4 100644 --- a/R/io_parquet.R +++ b/R/io_parquet.R @@ -15,8 +15,8 @@ #' @param use_statistics Use statistics in the parquet file to determine if pages #' can be skipped from reading. #' @param storage_options Experimental. List of options necessary to scan -#' parquet files from different cloud storage providers (GCP, AWS, Azure). -#' See the 'Details' section. +#' parquet files from different cloud storage providers (GCP, AWS, Azure, +#' HuggingFace). See the 'Details' section. #' #' @rdname IO_scan_parquet #' @details @@ -44,11 +44,31 @@ #' - [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html) #' - [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html) #' -#' ### Implementation details -#' -#' - Currently it is impossible to scan public parquet files from GCP without -#' a valid service account. Be sure to always include a service account in the -#' `storage_options` argument. +#' Currently it is impossible to scan public parquet files from GCP without +#' a valid service account. Be sure to always include a service account in the +#' `storage_options` argument. +#' +#' ## Scanning from HuggingFace +#' +#' It is possible to scan data stored on HuggingFace using a path starting with +#' `hf://`. 
The `hf://` path format is defined as +#' `hf://BUCKET/REPOSITORY@REVISION/PATH`, where: +#' +#' * BUCKET is one of datasets or spaces +#' * REPOSITORY is the location of the repository. This is usually in the +#' format of username/repo_name. A branch can also be optionally specified by +#' appending `@branch`. +#' * REVISION is the name of the branch (or commit) to use. This is optional +#' and defaults to main if not given. +#' * PATH is a file or directory path, or a glob pattern from the repository +#' root. +#' +#' A Hugging Face API key can be passed to access private locations using +#' either of the following methods: +#' * Passing a token in storage_options to the scan function, e.g. +#' `scan_parquet(..., storage_options = list(token = <your HF token>))` +#' * Setting the HF_TOKEN environment variable, e.g. +#' `Sys.setenv(HF_TOKEN = <your HF token>)`. #' #' @examplesIf requireNamespace("withr", quietly = TRUE) #' # Write a Parquet file than we can then import as DataFrame diff --git a/man/IO_read_parquet.Rd b/man/IO_read_parquet.Rd index 2d6e07ad9..1c0da7c10 100644 --- a/man/IO_read_parquet.Rd +++ b/man/IO_read_parquet.Rd @@ -61,8 +61,8 @@ the final DataFrame into contiguous memory chunks.} \item{low_memory}{Reduce memory usage (will yield a lower performance).} \item{storage_options}{Experimental. List of options necessary to scan -parquet files from different cloud storage providers (GCP, AWS, Azure). -See the 'Details' section.} +parquet files from different cloud storage providers (GCP, AWS, Azure, +HuggingFace). 
See the 'Details' section.} \item{use_statistics}{Use statistics in the parquet file to determine if pages can be skipped from reading.} @@ -103,14 +103,36 @@ here: \item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp} \item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure} } -\subsection{Implementation details}{ -\itemize{ -\item Currently it is impossible to scan public parquet files from GCP without + +Currently it is impossible to scan public parquet files from GCP without a valid service account. Be sure to always include a service account in the \code{storage_options} argument. } + +\subsection{Scanning from HuggingFace}{ + +It is possible to scan data stored on HuggingFace using a path starting with +\verb{hf://}. The \verb{hf://} path format is defined as +\verb{hf://BUCKET/REPOSITORY@REVISION/PATH}, where: +\itemize{ +\item BUCKET is one of datasets or spaces +\item REPOSITORY is the location of the repository. This is usually in the +format of username/repo_name. A branch can also be optionally specified by +appending \verb{@branch}. +\item REVISION is the name of the branch (or commit) to use. This is optional +and defaults to main if not given. +\item PATH is a file or directory path, or a glob pattern from the repository +root. } +A Hugging Face API key can be passed to access private locations using +either of the following methods: +\itemize{ +\item Passing a token in storage_options to the scan function, e.g. +\verb{scan_parquet(..., storage_options = list(token = <your HF token>))} +\item Setting the HF_TOKEN environment variable, e.g. +\verb{Sys.setenv(HF_TOKEN = <your HF token>)}. 
+} } } \examples{ diff --git a/man/IO_scan_parquet.Rd b/man/IO_scan_parquet.Rd index f4ae9f113..202993bed 100644 --- a/man/IO_scan_parquet.Rd +++ b/man/IO_scan_parquet.Rd @@ -61,8 +61,8 @@ the final DataFrame into contiguous memory chunks.} \item{low_memory}{Reduce memory usage (will yield a lower performance).} \item{storage_options}{Experimental. List of options necessary to scan -parquet files from different cloud storage providers (GCP, AWS, Azure). -See the 'Details' section.} +parquet files from different cloud storage providers (GCP, AWS, Azure, +HuggingFace). See the 'Details' section.} \item{use_statistics}{Use statistics in the parquet file to determine if pages can be skipped from reading.} @@ -103,14 +103,36 @@ here: \item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp} \item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure} } -\subsection{Implementation details}{ -\itemize{ -\item Currently it is impossible to scan public parquet files from GCP without + +Currently it is impossible to scan public parquet files from GCP without a valid service account. Be sure to always include a service account in the \code{storage_options} argument. } + +\subsection{Scanning from HuggingFace}{ + +It is possible to scan data stored on HuggingFace using a path starting with +\verb{hf://}. The \verb{hf://} path format is defined as +\verb{hf://BUCKET/REPOSITORY@REVISION/PATH}, where: +\itemize{ +\item BUCKET is one of datasets or spaces +\item REPOSITORY is the location of the repository. This is usually in the +format of username/repo_name. A branch can also be optionally specified by +appending \verb{@branch}. +\item REVISION is the name of the branch (or commit) to use. This is optional +and defaults to main if not given. +\item PATH is a file or directory path, or a glob pattern from the repository +root. 
} +A Hugging Face API key can be passed to access private locations using +either of the following methods: +\itemize{ +\item Passing a token in storage_options to the scan function, e.g. +\verb{scan_parquet(..., storage_options = list(token = <your HF token>))} +\item Setting the HF_TOKEN environment variable, e.g. +\verb{Sys.setenv(HF_TOKEN = <your HF token>)}. +} } } \examples{