From 7decc11ecdee535d8a2c417e00b964da7cdef540 Mon Sep 17 00:00:00 2001
From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
Date: Sat, 5 Oct 2024 20:23:42 +0200
Subject: [PATCH] docs: mention capability to read parquet files from
 HuggingFace (#1248)

---
 R/io_parquet.R         | 34 +++++++++++++++++++++++++++-------
 man/IO_read_parquet.Rd | 32 +++++++++++++++++++++++++++-----
 man/IO_scan_parquet.Rd | 32 +++++++++++++++++++++++++++-----
 3 files changed, 81 insertions(+), 17 deletions(-)
diff --git a/R/io_parquet.R b/R/io_parquet.R
index 348d1ba5c..58f919ed4 100644
--- a/R/io_parquet.R
+++ b/R/io_parquet.R
@@ -15,8 +15,8 @@
 #' @param use_statistics Use statistics in the parquet file to determine if pages
 #' can be skipped from reading.
 #' @param storage_options Experimental. List of options necessary to scan
-#' parquet files from different cloud storage providers (GCP, AWS, Azure).
-#' See the 'Details' section.
+#' parquet files from different cloud storage providers (GCP, AWS, Azure,
+#' HuggingFace). See the 'Details' section.
 #'
 #' @rdname IO_scan_parquet
 #' @details
@@ -44,11 +44,31 @@
 #' - [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
 #' - [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
 #'
-#' ### Implementation details
-#'
-#' - Currently it is impossible to scan public parquet files from GCP without
-#'   a valid service account. Be sure to always include a service account in the
-#'   `storage_options` argument.
+#' Currently it is impossible to scan public parquet files from GCP without
+#' a valid service account. Be sure to always include a service account in the
+#' `storage_options` argument.
+#'
+#' ## Scanning from HuggingFace
+#'
+#' It is possible to scan data stored on HuggingFace using a path starting with
+#' `hf://`. The `hf://` path format is defined as
+#' `hf://BUCKET/REPOSITORY@REVISION/PATH`, where:
+#'
+#' * BUCKET is one of `datasets` or `spaces`.
+#' * REPOSITORY is the location of the repository. This is usually in the
+#'   format of username/repo_name. A branch can also be optionally specified by
+#'   appending `@branch`.
+#' * REVISION is the name of the branch (or commit) to use. This is optional
+#'   and defaults to `main` if not given.
+#' * PATH is a file or directory path, or a glob pattern from the repository
+#'   root.
+#'
+#' A Hugging Face API key can be passed to access private locations using
+#' either of the following methods:
+#' * Passing a token in `storage_options` to the scan function, e.g.
+#'   `scan_parquet(..., storage_options = list(token = <your HF token>))`
+#' * Setting the `HF_TOKEN` environment variable, e.g.
+#'   `Sys.setenv(HF_TOKEN = <your HF token>)`.
 #'
 #' @examplesIf requireNamespace("withr", quietly = TRUE)
 #' # Write a Parquet file than we can then import as DataFrame
diff --git a/man/IO_read_parquet.Rd b/man/IO_read_parquet.Rd
index 2d6e07ad9..1c0da7c10 100644
--- a/man/IO_read_parquet.Rd
+++ b/man/IO_read_parquet.Rd
@@ -61,8 +61,8 @@ the final DataFrame into contiguous memory chunks.}
 \item{low_memory}{Reduce memory usage (will yield a lower performance).}
 
 \item{storage_options}{Experimental. List of options necessary to scan
-parquet files from different cloud storage providers (GCP, AWS, Azure).
-See the 'Details' section.}
+parquet files from different cloud storage providers (GCP, AWS, Azure,
+HuggingFace). See the 'Details' section.}
 
 \item{use_statistics}{Use statistics in the parquet file to determine if pages
 can be skipped from reading.}
@@ -103,14 +103,36 @@ here:
 \item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp}
 \item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure}
 }
-\subsection{Implementation details}{
-\itemize{
-\item Currently it is impossible to scan public parquet files from GCP without
+
+Currently it is impossible to scan public parquet files from GCP without
 a valid service account. Be sure to always include a service account in the
 \code{storage_options} argument.
 }
+
+\subsection{Scanning from HuggingFace}{
+
+It is possible to scan data stored on HuggingFace using a path starting with
+\verb{hf://}. The \verb{hf://} path format is defined as
+\verb{hf://BUCKET/REPOSITORY@REVISION/PATH}, where:
+\itemize{
+\item BUCKET is one of \code{datasets} or \code{spaces}.
+\item REPOSITORY is the location of the repository. This is usually in the
+format of username/repo_name. A branch can also be optionally specified by
+appending \verb{@branch}.
+\item REVISION is the name of the branch (or commit) to use. This is optional
+and defaults to \code{main} if not given.
+\item PATH is a file or directory path, or a glob pattern from the repository
+root.
 }
 
+A Hugging Face API key can be passed to access private locations using
+either of the following methods:
+\itemize{
+\item Passing a token in \code{storage_options} to the scan function, e.g.
+\verb{scan_parquet(..., storage_options = list(token = <your HF token>))}
+\item Setting the \code{HF_TOKEN} environment variable, e.g.
+\verb{Sys.setenv(HF_TOKEN = <your HF token>)}.
+}
 }
 }
 \examples{
diff --git a/man/IO_scan_parquet.Rd b/man/IO_scan_parquet.Rd
index f4ae9f113..202993bed 100644
--- a/man/IO_scan_parquet.Rd
+++ b/man/IO_scan_parquet.Rd
@@ -61,8 +61,8 @@ the final DataFrame into contiguous memory chunks.}
 \item{low_memory}{Reduce memory usage (will yield a lower performance).}
 
 \item{storage_options}{Experimental. List of options necessary to scan
-parquet files from different cloud storage providers (GCP, AWS, Azure).
-See the 'Details' section.}
+parquet files from different cloud storage providers (GCP, AWS, Azure,
+HuggingFace). See the 'Details' section.}
 
 \item{use_statistics}{Use statistics in the parquet file to determine if pages
 can be skipped from reading.}
@@ -103,14 +103,36 @@ here:
 \item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp}
 \item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure}
 }
-\subsection{Implementation details}{
-\itemize{
-\item Currently it is impossible to scan public parquet files from GCP without
+
+Currently it is impossible to scan public parquet files from GCP without
 a valid service account. Be sure to always include a service account in the
 \code{storage_options} argument.
 }
+
+\subsection{Scanning from HuggingFace}{
+
+It is possible to scan data stored on HuggingFace using a path starting with
+\verb{hf://}. The \verb{hf://} path format is defined as
+\verb{hf://BUCKET/REPOSITORY@REVISION/PATH}, where:
+\itemize{
+\item BUCKET is one of \code{datasets} or \code{spaces}.
+\item REPOSITORY is the location of the repository. This is usually in the
+format of username/repo_name. A branch can also be optionally specified by
+appending \verb{@branch}.
+\item REVISION is the name of the branch (or commit) to use. This is optional
+and defaults to \code{main} if not given.
+\item PATH is a file or directory path, or a glob pattern from the repository
+root.
 }
 
+A Hugging Face API key can be passed to access private locations using
+either of the following methods:
+\itemize{
+\item Passing a token in \code{storage_options} to the scan function, e.g.
+\verb{scan_parquet(..., storage_options = list(token = <your HF token>))}
+\item Setting the \code{HF_TOKEN} environment variable, e.g.
+\verb{Sys.setenv(HF_TOKEN = <your HF token>)}.
+}
 }
 }
 \examples{