satijalab · melania15 · Jan 11, 2023 · Jan 11, 2023 · Jan 11, 2023 · Jan 11, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -24,7 +24,8 @@ Remotes: welch-lab/liger,
     mojaveazure/seurat-disk,
     powellgenomicslab/Nebulosa,
     atakanekiz/CIPR-Package,
-    prabhakarlab/Banksy
+    prabhakarlab/Banksy,
+    gambalab/gficf
 Depends:
     R (>= 3.5.0)
 biocViews:
@@ -45,7 +46,7 @@ Collate:
     'internal.R'
     'alevin.R'
     'alra.R'
-    'banksy.R' 
+    'banksy.R'
     'cellbrowser.R'
     'cogaps.R'
     'conos.R'
@@ -55,11 +56,12 @@ Collate:
     'miqc.R'
     'monocle3.R'
     'presto.R'
+    'scGSEA.R'
     'tricycle.R'
     'velocity.R'
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.3
 Suggests:
     cipr,
     conos,

diff --git a/NAMESPACE b/NAMESPACE
@@ -23,10 +23,12 @@ export(RunPrestoAll)
 export(RunQuantileAlignSNF)
 export(RunQuantileNorm)
 export(RunSNF)
+export(RunScGSEA)
 export(RunVelocity)
 export(Runtricycle)
 export(StopCellbrowser)
 export(as.cell_data_set)
+import(RcppML)
 importFrom(BiocManager,install)
 importFrom(Matrix,Matrix)
 importFrom(Matrix,writeMM)

diff --git a/R/scGSEA.R b/R/scGSEA.R
@@ -0,0 +1,87 @@
+#' @include internal.R
+#'
+NULL
+
+#' Run Single cell Gene Set Enrichement Analysis on GF-ICF on a Seurat object
+#'
+#' This function run runScGSEA function from GF-ICF package on Seurat object. 
+#' It computes GSEA for each cells across a set of input pathways by using NMF.
+#'
+#' @param object Seurat object
+#' @param assay Assay to use, defaults to the default assay
+#' @param filterGenes Rank of NMF to use for the enrichment
+#' @param nmf.dim Rank of NMF to use for the enrichment
+#' @param geneID The type of gene names in rownames of \code{object}. It can be either 'ensamble' or 'symbol'. Default: 'ensamble'
+#' @param species The type of species in \code{object}. It can be either 'mouse' or 'human'. Default: 'human'
+#' @param category MSigDB collection abbreviation, such as H or C1
+#' @param subcategory MSigDB sub-collection abbreviation, such as CGP or BP
+#' @param rescale If different by none, pathway's activity scores are resealed as Z-score. Possible values are none, byGS or byCell. Default is none
+#'
+#' @return returns a Seurat object with pathways x cell matrix 
+#' @import RcppML
+#' @export
+
+RunScGSEA <- function(
+    object,
+    assay = NULL,
+    filterGenes = TRUE,
+    nmf.dim = 100,
+    geneID = c("ensamble","symbol"),
+    species = c("mouse", "human"),
+    category = "H",
+    subcategory = NULL,
+    verbose = F) {
+
+    # Store Seurat metadata in separate variable to enable working on different assay types. (i.e. SCT assay does not have metadata slot as it refers to RNA assay)  
+    meta.data = [email protected]
+
+    SeuratWrappers:::CheckPackage(package = 'gambalab/gficf', repository = 'github')
+    assay <- assay %||% SeuratObject::DefaultAssay(object = object)
+
+    # Get raw count matrix from Seurat object from the assay slot defined by the user
+    M <- SeuratObject::GetAssayData(object = object, slot = "counts")
+    # Data normalization and gene filtering
+    M <- gficf::gficf(M = M, filterGenes = filterGenes)
+    # Create NMF-subspace 
+    M <- gficf::runNMF(data = M, dim = nmf.dim)
+    # Create t-UMAP space
+    M <- gficf::runReduction(data = M, reduction = "umap", verbose = verbose)
+    # Compute GSEA for each cells across a set of input pathways by using NMF
+    M <- gficf::runScGSEA(data = M,
+                          geneID = geneID,
+                          species = species,
+                          category = category,
+                          subcategory = subcategory,
+                          nmf.k = nmf.dim,
+                          fdr.th = .1,
+                          rescale = 'none',
+                          verbose = verbose)
+
+    # Add cell names to enrichment results
+    raw_enrich <- t(M$scgsea$x)
+    colnames(raw_enrich) <- colnames(object)
+    # Normalize enrichment results by computing pathway's activity z-scores
+    norm_enrich <- Matrix::Matrix((raw_enrich - Matrix::rowMeans(raw_enrich)) / apply(raw_enrich, 1, sd), sparse=T)
+    # Create a new Seurat object containing the raw pathway x cell matrix in counts slot and preserving meta data (adding _byGenes tag only if clusters were already computed)
+    if('seurat_clusters' %in% colnames(meta.data)) {
+      colnames(meta.data) <- gsub('seurat_clusters', 'seurat_clusters_byGenes', colnames(meta.data))
+    }
+    path_obj <- CreateSeuratObject(counts = raw_enrich, meta.data = meta.data, assay = 'pathway')
+    # And the z-score-normalized one in data and scale.data slots
+    path_obj <- SetAssayData(object = path_obj, slot = 'data', new.data = norm_enrich)
+    path_obj <- SetAssayData(object = path_obj, slot = 'scale.data', new.data = as.matrix(norm_enrich))
+    # Store metadata of pathways retained after significance filtering
+    feat.meta <- M$scgsea$stat[M$scgsea$stat$pathway%in%colnames(M$scgsea$x), ] 
+    feat.meta <- data.frame(feat.meta, row.names = 1)
+    feat.meta$genes <- do.call(c, lapply(M$scgsea$pathways[rownames(feat.meta)], function(x) paste(x, collapse = ',')))
+    path_obj[['pathway']]@meta.features <- feat.meta
+
+    # Store dimensionality reduction results computed on genes x cells matrix
+    path_obj@reductions <- object@reductions
+    names(path_obj@reductions) <- paste0(names(path_obj@reductions), '_byGenes')
+
+    return(path_obj)
+
+}
+
+
diff --git a/docs/scGSEA.Rmd b/docs/scGSEA.Rmd
@@ -0,0 +1,182 @@
+---
+title: "Running scGSEA from gficf on Seurat Objects"
+date: 'Compiled: `r format(Sys.Date(), "%m %d %y")`'
+output: 
+  html_document:
+    df_print: kable
+  github_document:
+    html_preview: true
+    toc: true
+    toc_depth: 3
+---
+
+This vignette demonstrates how to run single-cell gene set enrichment analysis with gficf on Seurat objects. 
+
+> *Single-cell gene set enrichment analysis and transfer learning for functional annotation of scRNA-seq data*
+>
+> Franchini Melania, Pellecchia Simona , Viscido Gaetano, Gambardella Gennaro
+> 
+> NAR Genomics and Bioinformatics, Volume 5, Issue 1, March 2023, lqad024
+> 
+> doi:
+> <https://doi.org/10.1093/nargab/lqad024>
+> 
+> GitHub:
+> <https://github.com/gambalab/gficf>
+
+```{r setup, include=FALSE}
+
+knitr::opts_chunk$set(
+  tidy = TRUE,
+  tidy.opts = list(width.cutoff = 95),
+  message = FALSE,
+  warning = FALSE,
+  fig.height = 10,
+  fig.width = 20
+)
+
+```
+
+
+Prerequisites to install: 
+
+* [Seurat](https://satijalab.org/seurat/install) 
+* [SeuratData](https://github.com/satijalab/seurat-data) 
+* [SeuratWrappers](https://github.com/satijalab/seurat-wrappers)
+* [gficf](https://github.com/gambalab/gficf)
+
+```{r looad libraries}
+
+require(Seurat)
+require(SeuratData)
+require(SeuratWrappers)
+require(gficf)
+
+```
+
+# Installation
+Please install the Seurat wrapper which enables to run single-cell gene set enrichment analysis with gficf on Seurat objects as reported here:
+
+```{r install wrapper from gficf}
+
+if(!require(devtools)){install.packages("devtools")}
+devtools::install_github("gambalab/seurat-wrappers")
+
+```
+
+
+# Introduction
+GF-ICF is an R package for normalization, visualization and analysis of of single-cell RNA sequencing data, based on a data transformation model called term frequency–inverse document frequency (TF-IDF), which has been extensively used in the field of text mining. Here, we show one of the latest implementation which enables the pathway activity computation at single-cell level.
+
+## Load examle data from SeuratData
+To demonstrate how to run runScGSEA() function on a single-cell RNA-seq dataset.
+We will use the pbmc3k.final dataset from the SeuratData package which contains 2,638 PBMCs processed using the standard Seurat workflow as demonstrated in Seurat's guided clustering tutorial.
+
+```{r load example data}
+
+InstallData("pbmc3k")
+data("pbmc3k.final")
+pbmc3k.final
+
+```
+
+## Compute pathways activity at single-cell resolution
+Single cell gene set enrichment analysis is performed by the function runScGSEA() of gficf package which computes GSEA scores for each cell across a set of input pathways by using NMF.
+The list of gene sets to use can can be specified trough the category and subcategory parameters. 
+
+```{r runScGsea, results='hide'}
+
+pbmc.ptw <- RunScGSEA(object = pbmc3k.final, geneID = "symbol",  species = "human", category = "H")
+
+```
+
+
+
+# PART 1 : Evaluate pathways activity in single-cell data
+## Visualize UMAP plot computed on genes expression
+As dimensionality reduction results computed on genes expression are preserved, let's graph the UMAP computed on genes x cells matrix. In this case cells are colored by their cell type.
+
+```{r DimPlotGenes}
+
+DimPlot(pbmc.ptw, reduction = "umap_byGenes", group.by = "seurat_annotations")
+
+```
+
+## Identify how pathways activity change across cell type
+To evaluate how pathways activity change across cell type, we utilize DotPlot() function where dot size encodes for the percentage of cells of each cell type in which the given pathway is active. Finally, cell types are ordered (by hierarchical clustering) based on given features by setting cluster.idents = TRUE.
+
+```{r DotPlotPathways}
+
+library(ggplot2)
+library(RColorBrewer)
+
+all.pathways = rownames(pbmc.ptw)
+
+DotPlot(object = pbmc.ptw,
+        features = all.pathways,
+        split.by = 'seurat_annotations',
+        cols = RColorBrewer::brewer.pal(n = length(unique(pbmc.ptw$seurat_annotations)), name = 'Set1'),
+        cluster.idents = TRUE) +
+  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1, size = 10))
+
+```
+
+## Inspect pathways activation in each cell of a given cell type
+To inspect pathways activation in each cell of a given cell type, we use DoHeatmap() function. In this case, we are plotting all given pathways for each cell type.
+
+```{r DoHeatmapPathways}
+
+DoHeatmap(pbmc.ptw, features = all.pathways, group.by = "seurat_annotations", angle = 90) + NoLegend()
+
+```
+
+
+
+# PART 2 : Analyze data based on pathways activity scores
+## Visualize UMAP plot computed on pathways activity scores
+Now, let's see the UMAP computed on pathways activity scores. To this end, we follow the standard Seurat workflow which requires the computation of PCA on scaled data (i.e. pathway scores normalized by gene-set) before run the UMAP reduction. Results will be stored in separate reduction slot of the Seurat object.
+Finally, we display the UMAP plot where cells are colored by their cell type.
+
+```{r UMAPcellTypePathways, results='hide'}
+
+pbmc.ptw <- RunPCA(pbmc.ptw, features = all.pathways)
+pbmc.ptw <- RunUMAP(pbmc.ptw, dims = 1:5)
+DimPlot(pbmc.ptw, reduction = "umap", group.by = 'seurat_annotations')
+
+```
+
+## Cluster cells based on pathways activity scores
+Now that we reconstructed pathway’s activity at single cell level we can try to cluster cell according to these values using Seurat functions FindNeighbors() and FindClusters(). Finally, we graph clustering results computed both on gene expression and pathway activity scores as UMAP plot form.
+```{r UMAPclustersPathways}
+
+pbmc.ptw <- FindNeighbors(pbmc.ptw, dims = 1:10)
+pbmc.ptw <- FindClusters(pbmc.ptw, resolution = 0.5)
+DimPlot(pbmc.ptw, reduction = "umap", group.by = c('seurat_clusters_byGenes', 'seurat_clusters'))
+
+```
+
+
+# TIPS
+## How can I handle collection with an high number of genesets?
+In the case you are handling collections containing a lot of gene-sets, it could be useful to perform a feature selection step. To this end, we first store pathways metadata in a separate variable (i.e. feat.meta), then we create an empty dataframe in the meta.features slot to enable the storing of the output of the Seurat function FindVariableFeatures() setting nfeatures equal to 10% of the number of genesets used.
+
+```{r PCAselection, results='hide'}
+
+pbmc.ptw.kegg <- RunScGSEA(object = pbmc3k.final,
+                           geneID = "symbol",
+                           species = "human",
+                           category = "C2",
+                           subcategory = 'CP:KEGG')
+
+feat.meta <- pbmc.ptw.kegg[['RNA']]@meta.features
+pbmc.ptw.kegg[['RNA']]@meta.features <- data.frame(row.names = rownames(pbmc.ptw.kegg))
+nfeatures <- round(nrow(pbmc.ptw.kegg)*0.10)
+pbmc.ptw.kegg <- FindVariableFeatures(pbmc.ptw.kegg, selection.method = "vst", nfeatures = nfeatures)
+top <- head(VariableFeatures(pbmc.ptw.kegg), nfeatures)
+LabelPoints(plot = VariableFeaturePlot(pbmc.ptw.kegg), points = top, repel = TRUE)
+
+pbmc.ptw.kegg <- RunPCA(pbmc.ptw.kegg, features = VariableFeatures(object = pbmc.ptw.kegg))
+VizDimLoadings(pbmc.ptw.kegg, dims = 1:2, reduction = "pca")
+
+```
+
diff --git a/docs/scGSEA.html b/docs/scGSEA.html