Merge pull request #243 from JBGruber/master
Rewrite of spacy_install
kbenoit authored Dec 6, 2023
2 parents f930072 + 80ddb76 commit 28feb2b
Showing 84 changed files with 4,050 additions and 4,192 deletions.
4 changes: 4 additions & 0 deletions .Rbuildignore
@@ -29,3 +29,7 @@ index.Rmd
index.md
installation.Rmd
installation.md
\.pem$

^\.github$
^codecov\.yml$
1 change: 1 addition & 0 deletions .github/.gitignore
@@ -0,0 +1 @@
*.html
49 changes: 49 additions & 0 deletions .github/workflows/R-CMD-check.yaml
@@ -0,0 +1,49 @@
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]

name: R-CMD-check

jobs:
R-CMD-check:
runs-on: ${{ matrix.config.os }}

name: ${{ matrix.config.os }} (${{ matrix.config.r }})

strategy:
fail-fast: false
matrix:
config:
- {os: macos-latest, r: 'release'}
- {os: windows-latest, r: 'release'}
- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-latest, r: 'release'}
- {os: ubuntu-latest, r: 'oldrel-1'}

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
R_KEEP_PKG_SOURCE: yes

steps:
- uses: actions/checkout@v3

- uses: r-lib/actions/setup-pandoc@v2

- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}
use-public-rspm: true

- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::rcmdcheck
needs: check

- uses: r-lib/actions/check-r-package@v2
with:
upload-snapshots: true
2 changes: 2 additions & 0 deletions .gitignore
@@ -19,3 +19,5 @@ R/unused
src/spacyr.dll
CRAN-RELEASE
revdep
/doc/
/Meta/
9 changes: 6 additions & 3 deletions DESCRIPTION
@@ -1,14 +1,16 @@
Package: spacyr
Type: Package
Title: Wrapper to the 'spaCy' 'NLP' Library
Version: 1.2.1
Version: 1.3.0
Authors@R: c(
person("Kenneth", "Benoit", email = "[email protected]", role = c("cre", "aut", "cph"), comment = c(ORCID = "0000-0002-0797-564X")),
person("Akitaka", "Matsuo", email = "[email protected]", role = "aut", comment = c(ORCID = "0000-0002-3323-6330")),
person("Johannes", "Gruber", , "[email protected]", role = "ctb",
comment = c(ORCID = "0000-0001-9177-1772")),
person("European Research Council", role = "fnd", comment = "ERC-2011-StG 283794-QUANTESS")
)
Description: An R wrapper to the 'Python' 'spaCy' 'NLP' library,
from <http://spacy.io>.
from <https://spacy.io>.
License: GPL-3
LazyData: TRUE
Depends:
@@ -30,6 +32,7 @@ Suggests:
URL: https://spacyr.quanteda.io
Encoding: UTF-8
BugReports: https://github.com/quanteda/spacyr/issues
RoxygenNote: 7.1.2
RoxygenNote: 7.2.3
Language: en-GB
VignetteBuilder: R.rsp
Roxygen: list(markdown = TRUE)
1 change: 0 additions & 1 deletion NAMESPACE
@@ -15,7 +15,6 @@ S3method(spacy_tokenize,data.frame)
export(entity_consolidate)
export(entity_extract)
export(find_spacy)
export(find_spacy_env)
export(get_attrs)
export(get_dependency)
export(get_named_entities)
4 changes: 4 additions & 0 deletions NEWS.md
@@ -1,3 +1,7 @@
# spacyr v1.3

* Overhauled the `spacy_install()` and `spacy_uninstall()` functions to provide a more modern and robust installation procedure.


# spacyr v1.2

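Since this commit is the spacy_install() rewrite the NEWS entry above describes, here is a minimal sketch of the new installation flow, assuming the exported spacy_install(), spacy_download_langmodel(), and spacy_initialize() interface; "en_core_web_sm" is spaCy's small English model, used purely for illustration:

    library(spacyr)
    # one-time setup: create a dedicated Python environment and install spaCy into it
    spacy_install()
    # fetch a language model into that environment
    spacy_download_langmodel("en_core_web_sm")
    # attach the spaCy backend for this R session
    spacy_initialize(model = "en_core_web_sm")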
4 changes: 2 additions & 2 deletions R/data.R
@@ -1,12 +1,12 @@
#' An R wrapper to the spaCy NLP system
#'
#' An R wrapper to the Python (Cython) spaCy NLP system, from
#' \url{http://spacy.io}. Nicely integrated with \pkg{quanteda}. \pkg{spacyr}
#' <https://spacy.io>. Nicely integrated with \pkg{quanteda}. \pkg{spacyr}
#' is designed to provide easy access to the powerful functionality of spaCy, in
#' a simple format.
#'
#'
#' @references \url{https://spacy.io}, \url{https://spacyr.quanteda.io}.
#' @references <https://spacy.io>, <https://spacyr.quanteda.io>.
#' @author Ken Benoit and Akitaka Matsuo
"_PACKAGE"

29 changes: 13 additions & 16 deletions R/entity-functions.R
@@ -1,25 +1,22 @@
#' Extract or consolidate entities from parsed documents
#'
#' From an object parsed by \code{\link{spacy_parse}}, extract the entities as a
#' From an object parsed by [spacy_parse()], extract the entities as a
#' separate object, or convert the multi-word entities into single "token"
#' consisting of the concatenated elements of the multi-word entities.
#' @param x output from \code{\link{spacy_parse}}.
#' @param type type of named entities, either \code{named}, \code{extended}, or
#' \code{all}. See
#' \url{https://spacy.io/docs/usage/entity-recognition#entity-types} for
#' @param x output from [spacy_parse()].
#' @param type type of named entities, either `named`, `extended`, or
#' `all`. See
#' <https://spacy.io/docs/usage/entity-recognition#entity-types> for
#' details.
#' @return \code{entity_extract} returns a \code{data.frame} of all named
#' @returns `entity_extract()` returns a data.frame of all named
#' entities, containing the following fields:
#' \itemize{
#' \item{\code{doc_id}}{ name of the document containing the entity}
#' \item{\code{sentence_id}}{ the sentence ID containing the entity, within the document}
#' \item{\code{entity}}{ the named entity}
#' \item{\code{entity_type}}{ type of named entities (e.g. PERSON, ORG, PERCENT,
#' etc.)}
#' }
#' * `doc_id` name of the document containing the entity
#' * `sentence_id` the sentence ID containing the entity, within the document
#' * `entity` the named entity
#' * `entity_type` the type of named entities (e.g. PERSON, ORG, PERCENT, etc.)
#' @importFrom data.table data.table as.data.table
#' @examples
#' \donttest{
#' \dontrun{
#' spacy_initialize()
#'
#' # entity extraction
@@ -70,13 +67,13 @@ entity_extract.spacyr_parsed <- function(x, type = c("named", "extended", "all")
#' @rdname entity_extract
#' @param concatenator the character(s) used to join the elements of multi-word
#' named entities
#' @return \code{entity_consolidate} returns a modified \code{data.frame} of
#' @return `entity_consolidate` returns a modified `data.frame` of
#' parsed results, where the named entities have been combined into a single
#' "token". Currently, dependency parsing is removed when this consolidation
#' occurs.
#' @importFrom data.table data.table
#' @examples
#' \donttest{
#' \dontrun{
#' # consolidating multi-word entities
#' txt <- "The House of Representatives voted to suspend aid to South Dakota."
#' parsed <- spacy_parse(txt, entity = TRUE)
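To complete the truncated example above, a hedged end-to-end sketch of the two entity functions this file documents, reusing the sample text from the diff (column names follow the documented return fields):

    library(spacyr)
    spacy_initialize()
    txt <- "The House of Representatives voted to suspend aid to South Dakota."
    parsed <- spacy_parse(txt, entity = TRUE)
    # one row per named entity: doc_id, sentence_id, entity, entity_type
    entity_extract(parsed, type = "named")
    # same parse, with multi-word entities collapsed into single "tokens"
    entity_consolidate(parsed, concatenator = "_")
    spacy_finalize()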
23 changes: 10 additions & 13 deletions R/nounphrase-functions.R
@@ -1,24 +1,21 @@
#' Extract or consolidate noun phrases from parsed documents
#'
#' From an object parsed by \code{\link{spacy_parse}}, extract the multi-word
#' From an object parsed by [spacy_parse()], extract the multi-word
#' noun phrases as a separate object, or convert the multi-word noun phrases
#' into single "token" consisting of the concatenated elements of the multi-word
#' noun phrases.
#' @param x output from \code{\link{spacy_parse}}
#' @param x output from [spacy_parse()]
#' @param concatenator the character(s) used to join elements of multi-word
#' noun phrases
#' @return \code{noun} returns a \code{data.frame} of all named
#' @return `noun` returns a `data.frame` of all named
#' entities, containing the following fields:
#' \itemize{
#' \item{\code{doc_id}}{ name of the document containing the noun phrase}
#' \item{\code{sentence_id}}{ the sentence ID containing the noun phrase,
#' within the document}
#' \item{\code{nounphrase}}{the noun phrase}
#' \item{\code{root}}{ the root token of the noun phrase}
#' }
#' * `doc_id` name of the document containing the noun phrase
#' * `sentence_id` the sentence ID containing the noun phrase, within the document
#' * `nounphrase` the noun phrase
#' * `root` the root token of the noun phrase
#' @importFrom data.table data.table as.data.table
#' @examples
#' \donttest{
#' \dontrun{
#' spacy_initialize()
#'
#' # entity extraction
@@ -66,13 +63,13 @@ nounphrase_extract.spacyr_parsed <- function(x, concatenator = "_") {


#' @rdname nounphrase_extract
#' @return \code{nounphrase_consolidate} returns a modified \code{data.frame} of
#' @return `nounphrase_consolidate` returns a modified `data.frame` of
#' parsed results, where the noun phrases have been combined into a single
#' "token". Currently, dependency parsing is removed when this consolidation
#' occurs.
#' @importFrom data.table data.table
#' @examples
#' \donttest{
#' \dontrun{
#' # consolidating multi-word noun phrases
#' txt <- "The House of Representatives voted to suspend aid to South Dakota."
#' parsed <- spacy_parse(txt, nounphrase = TRUE)
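An analogous sketch for the noun-phrase pair, under the same assumptions as the entity example above:

    parsed <- spacy_parse(txt, nounphrase = TRUE)
    # one row per noun phrase: doc_id, sentence_id, nounphrase, root
    nounphrase_extract(parsed)
    # noun phrases collapsed into single tokens, elements joined by "_"
    nounphrase_consolidate(parsed, concatenator = "_")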
26 changes: 13 additions & 13 deletions R/parse-extractor-functions.R
@@ -21,9 +21,9 @@ spacy_out <- setRefClass(

#' get functions for spaCy
#'
#' A collection of get methods for spacyr return objects (of \code{spacy_out} class).
#' A collection of get methods for spacyr return objects (of `spacy_out` class).
#' @param spacy_out a spacy_out object
#' @return \code{get_tokens} returns a data.frame of tokens from spaCy.
#' @return `get_tokens` returns a data.frame of tokens from spaCy.
#' @export
#' @name get-functions
#' @keywords internal
@@ -35,15 +35,15 @@ get_tokens <- function(spacy_out) {
}

#' @rdname get-functions
#' @return \code{get_tags} returns a tokenized text object with part-of-speech tags.
#' @return `get_tags` returns a tokenized text object with part-of-speech tags.
#' Options exist for using either the Google or Detailed tagsets. See
#' \url{http://spacy.io}.
#' @param tagset character label for the tagset to use, either \code{"google"}
#' or \code{"detailed"} to use the simplified Google tagset, or the more detailed
#' <https://spacy.io>.
#' @param tagset character label for the tagset to use, either `"google"`
#' or `"detailed"` to use the simplified Google tagset, or the more detailed
#' scheme from the Penn Treebank (or the German Text Archive in case of German language model).
#' @export
#' @examples
#' \donttest{
#' \dontrun{
#' # get_tags examples
#' txt <- c(text1 = "This is the first sentence.\nHere is the second sentence.",
#' text2 = "This is the second document.")
@@ -69,7 +69,7 @@ get_tags <- function(spacy_out, tagset = c("google", "detailed")) {

#' @rdname get-functions
#' @param attr_name name of spaCy token attributes to extract
#' @return \code{get_attrs} returns a list of attributes from spaCy output
#' @return `get_attrs` returns a list of attributes from spaCy output
#' @export
#' @keywords internal
get_attrs <- function(spacy_out, attr_name, deal_utf8 = FALSE) {
@@ -82,7 +82,7 @@ get_attrs <- function(spacy_out, attr_name, deal_utf8 = FALSE) {
}

#' @rdname get-functions
#' @return \code{get_named_entities} returns a list of named entities in texts
#' @return `get_named_entities` returns a list of named entities in texts
#' @export
#' @keywords internal
get_named_entities <- function(spacy_out){
@@ -104,7 +104,7 @@ get_named_entities <- function(spacy_out){


#' @rdname get-functions
#' @return \code{get_dependency} returns a data.frame of dependency relations.
#' @return `get_dependency` returns a data.frame of dependency relations.
#' @export
#' @keywords internal
get_dependency <- function(spacy_out) {
@@ -123,7 +123,7 @@ get_dependency <- function(spacy_out) {


#' @rdname get-functions
#' @return \code{get_noun_phrases} returns a data.frame of noun phrases.
#' @return `get_noun_phrases` returns a data.frame of noun phrases.
#' @export
#' @keywords internal
get_noun_phrases <- function(spacy_out) {
@@ -156,7 +156,7 @@ get_noun_phrases <- function(spacy_out) {
}

#' @rdname get-functions
#' @return \code{get_ntokens} returns a data.frame of dependency relations
#' @return `get_ntokens` returns a data.frame of dependency relations
#' @export
#' @keywords internal
get_ntokens <- function(spacy_out){
@@ -168,7 +168,7 @@ get_ntokens <- function(spacy_out){
}

#' @rdname get-functions
#' @return \code{get_ntokens_by_sent} returns a data.frame of dependency
#' @return `get_ntokens_by_sent` returns a data.frame of dependency
#' relations, by sentence
#' @export
#' @keywords internal
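The get_*() accessors in this file are marked @keywords internal, so end users normally reach the same information through spacy_parse(). As a hedged illustration, the "google"/"detailed" tagset distinction described for get_tags() surfaces there as the pos and tag arguments:

    # pos = TRUE adds the coarse Google/Universal tagset; tag = TRUE adds the
    # detailed Penn Treebank scheme (German Text Archive scheme for German models)
    spacy_parse("Quick brown foxes jump.", pos = TRUE, tag = TRUE)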
36 changes: 18 additions & 18 deletions R/spacy_extract_entity.R
@@ -1,33 +1,33 @@
#' Extract named entities from texts using spaCy
#'
#' This function extracts named entities from texts, based on the entity tag
#' \code{ent} attributes of documents objects parsed by spaCy (see
#' \url{https://spacy.io/usage/linguistic-features#section-named-entities}).
#' `ent` attributes of documents objects parsed by spaCy (see
#' <https://spacy.io/usage/linguistic-features#section-named-entities>).
#'
#' @param x a character object or a TIF-compliant
#' corpus data.frame (see \url{https://github.com/ropensci/tif})
#' corpus data.frame (see <https://github.com/ropenscilabs/tif>)
#' @inheritParams spacy_parse
#' @param output type of returned object, either \code{"list"} or
#' \code{"data.frame"}.
#' @param type type of named entities, either \code{named}, \code{extended}, or
#' \code{all}. See
#' \url{https://spacy.io/docs/usage/entity-recognition#entity-types} for
#' @param output type of returned object, either `"list"` or
#' `"data.frame"`.
#' @param type type of named entities, either `named`, `extended`, or
#' `all`. See
#' <https://spacy.io/docs/usage/entity-recognition#entity-types> for
#' details.
#' @param ... unused
#' @details When the option \code{output = "data.frame"} is selected, the
#' function returns a \code{data.frame} with the following fields.
#' \describe{\item{\code{text}}{contents of entity}
#' \item{\code{entity_type}}{type of entity (e.g. \code{ORG} for
#' organizations)} \item{\code{start_id}}{serial number ID of starting token.
#' This number corresponds with the number of \code{data.frame} returned from
#' \code{spacy_tokenize(x)} with default options.} \item{\code{length}}{number
#' @details When the option `output = "data.frame"` is selected, the
#' function returns a `data.frame` with the following fields.
#' \describe{\item{`text`}{contents of entity}
#' \item{`entity_type`}{type of entity (e.g. `ORG` for
#' organizations)} \item{`start_id`}{serial number ID of starting token.
#' This number corresponds with the number of `data.frame` returned from
#' `spacy_tokenize(x)` with default options.} \item{`length`}{number
#' of words (tokens) included in a named entity (e.g. for an entity, "New York
#' Stock Exchange"", \code{length = 4})}}
#' Stock Exchange"", `length = 4`)}}
#'
#' @return either a \code{list} or \code{data.frame} of tokens
#' @return either a `list` or `data.frame` of tokens
#' @export
#' @examples
#' \donttest{
#' \dontrun{
#' spacy_initialize()
#'
#' txt <- c(doc1 = "The Supreme Court is located in Washington D.C.",
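The example in this file is cut off above; a short usage sketch matching the data.frame output the documentation block describes (field names taken from that block, not verified against the function's output):

    spacy_initialize()
    txt <- c(doc1 = "The Supreme Court is located in Washington D.C.")
    # one row per entity, with the documented text, entity_type, start_id,
    # and length fields
    spacy_extract_entity(txt, output = "data.frame", type = "all")
    spacy_finalize()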
(The commit touches 84 files in total; the remaining file diffs are not shown here.)
