diff --git a/DESCRIPTION b/DESCRIPTION index d779fcb..2e5d8b0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,7 +24,8 @@ Remotes: welch-lab/liger, mojaveazure/seurat-disk, powellgenomicslab/Nebulosa, atakanekiz/CIPR-Package, - prabhakarlab/Banksy + prabhakarlab/Banksy, + gambalab/gficf Depends: R (>= 3.5.0) biocViews: @@ -45,7 +46,7 @@ Collate: 'internal.R' 'alevin.R' 'alra.R' - 'banksy.R' + 'banksy.R' 'cellbrowser.R' 'cogaps.R' 'conos.R' @@ -55,11 +56,12 @@ Collate: 'miqc.R' 'monocle3.R' 'presto.R' + 'scGSEA.R' 'tricycle.R' 'velocity.R' Encoding: UTF-8 LazyData: true -RoxygenNote: 7.1.1 +RoxygenNote: 7.2.3 Suggests: cipr, conos, diff --git a/NAMESPACE b/NAMESPACE index f593c28..b390a47 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,10 +23,12 @@ export(RunPrestoAll) export(RunQuantileAlignSNF) export(RunQuantileNorm) export(RunSNF) +export(RunScGSEA) export(RunVelocity) export(Runtricycle) export(StopCellbrowser) export(as.cell_data_set) +import(RcppML) importFrom(BiocManager,install) importFrom(Matrix,Matrix) importFrom(Matrix,writeMM) diff --git a/R/scGSEA.R b/R/scGSEA.R new file mode 100644 index 0000000..be5f1a8 --- /dev/null +++ b/R/scGSEA.R @@ -0,0 +1,87 @@ +#' @include internal.R +#' +NULL + +#' Run Single cell Gene Set Enrichement Analysis on GF-ICF on a Seurat object +#' +#' This function run runScGSEA function from GF-ICF package on Seurat object. +#' It computes GSEA for each cells across a set of input pathways by using NMF. +#' +#' @param object Seurat object +#' @param assay Assay to use, defaults to the default assay +#' @param filterGenes Rank of NMF to use for the enrichment +#' @param nmf.dim Rank of NMF to use for the enrichment +#' @param geneID The type of gene names in rownames of \code{object}. It can be either 'ensamble' or 'symbol'. Default: 'ensamble' +#' @param species The type of species in \code{object}. It can be either 'mouse' or 'human'. Default: 'human' +#' @param category MSigDB collection abbreviation, such as H or C1 +#' @param subcategory MSigDB sub-collection abbreviation, such as CGP or BP +#' @param rescale If different by none, pathway's activity scores are resealed as Z-score. Possible values are none, byGS or byCell. Default is none +#' +#' @return returns a Seurat object with pathways x cell matrix +#' @import RcppML +#' @export + +RunScGSEA <- function( + object, + assay = NULL, + filterGenes = TRUE, + nmf.dim = 100, + geneID = c("ensamble","symbol"), + species = c("mouse", "human"), + category = "H", + subcategory = NULL, + verbose = F) { + + # Store Seurat metadata in separate variable to enable working on different assay types. (i.e. SCT assay does not have metadata slot as it refers to RNA assay) + meta.data = object@meta.data + + SeuratWrappers:::CheckPackage(package = 'gambalab/gficf', repository = 'github') + assay <- assay %||% SeuratObject::DefaultAssay(object = object) + + # Get raw count matrix from Seurat object from the assay slot defined by the user + M <- SeuratObject::GetAssayData(object = object, slot = "counts") + # Data normalization and gene filtering + M <- gficf::gficf(M = M, filterGenes = filterGenes) + # Create NMF-subspace + M <- gficf::runNMF(data = M, dim = nmf.dim) + # Create t-UMAP space + M <- gficf::runReduction(data = M, reduction = "umap", verbose = verbose) + # Compute GSEA for each cells across a set of input pathways by using NMF + M <- gficf::runScGSEA(data = M, + geneID = geneID, + species = species, + category = category, + subcategory = subcategory, + nmf.k = nmf.dim, + fdr.th = .1, + rescale = 'none', + verbose = verbose) + + # Add cell names to enrichment results + raw_enrich <- t(M$scgsea$x) + colnames(raw_enrich) <- colnames(object) + # Normalize enrichment results by computing pathway's activity z-scores + norm_enrich <- Matrix::Matrix((raw_enrich - Matrix::rowMeans(raw_enrich)) / apply(raw_enrich, 1, sd), sparse=T) + # Create a new Seurat object containing the raw pathway x cell matrix in counts slot and preserving meta data (adding _byGenes tag only if clusters were already computed) + if('seurat_clusters' %in% colnames(meta.data)) { + colnames(meta.data) <- gsub('seurat_clusters', 'seurat_clusters_byGenes', colnames(meta.data)) + } + path_obj <- CreateSeuratObject(counts = raw_enrich, meta.data = meta.data, assay = 'pathway') + # And the z-score-normalized one in data and scale.data slots + path_obj <- SetAssayData(object = path_obj, slot = 'data', new.data = norm_enrich) + path_obj <- SetAssayData(object = path_obj, slot = 'scale.data', new.data = as.matrix(norm_enrich)) + # Store metadata of pathways retained after significance filtering + feat.meta <- M$scgsea$stat[M$scgsea$stat$pathway%in%colnames(M$scgsea$x), ] + feat.meta <- data.frame(feat.meta, row.names = 1) + feat.meta$genes <- do.call(c, lapply(M$scgsea$pathways[rownames(feat.meta)], function(x) paste(x, collapse = ','))) + path_obj[['pathway']]@meta.features <- feat.meta + + # Store dimensionality reduction results computed on genes x cells matrix + path_obj@reductions <- object@reductions + names(path_obj@reductions) <- paste0(names(path_obj@reductions), '_byGenes') + + return(path_obj) + +} + + diff --git a/docs/scGSEA.Rmd b/docs/scGSEA.Rmd new file mode 100644 index 0000000..82aed3a --- /dev/null +++ b/docs/scGSEA.Rmd @@ -0,0 +1,182 @@ +--- +title: "Running scGSEA from gficf on Seurat Objects" +date: 'Compiled: `r format(Sys.Date(), "%m %d %y")`' +output: + html_document: + df_print: kable + github_document: + html_preview: true + toc: true + toc_depth: 3 +--- + +This vignette demonstrates how to run single-cell gene set enrichment analysis with gficf on Seurat objects. + +> *Single-cell gene set enrichment analysis and transfer learning for functional annotation of scRNA-seq data* +> +> Franchini Melania, Pellecchia Simona , Viscido Gaetano, Gambardella Gennaro +> +> NAR Genomics and Bioinformatics, Volume 5, Issue 1, March 2023, lqad024 +> +> doi: +> +> +> GitHub: +> + +```{r setup, include=FALSE} + +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + fig.height = 10, + fig.width = 20 +) + +``` + + +Prerequisites to install: + +* [Seurat](https://satijalab.org/seurat/install) +* [SeuratData](https://github.com/satijalab/seurat-data) +* [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) +* [gficf](https://github.com/gambalab/gficf) + +```{r looad libraries} + +require(Seurat) +require(SeuratData) +require(SeuratWrappers) +require(gficf) + +``` + +# Installation +Please install the Seurat wrapper which enables to run single-cell gene set enrichment analysis with gficf on Seurat objects as reported here: + +```{r install wrapper from gficf} + +if(!require(devtools)){install.packages("devtools")} +devtools::install_github("gambalab/seurat-wrappers") + +``` + + +# Introduction +GF-ICF is an R package for normalization, visualization and analysis of of single-cell RNA sequencing data, based on a data transformation model called term frequency–inverse document frequency (TF-IDF), which has been extensively used in the field of text mining. Here, we show one of the latest implementation which enables the pathway activity computation at single-cell level. + +## Load examle data from SeuratData +To demonstrate how to run runScGSEA() function on a single-cell RNA-seq dataset. +We will use the pbmc3k.final dataset from the SeuratData package which contains 2,638 PBMCs processed using the standard Seurat workflow as demonstrated in Seurat's guided clustering tutorial. + +```{r load example data} + +InstallData("pbmc3k") +data("pbmc3k.final") +pbmc3k.final + +``` + +## Compute pathways activity at single-cell resolution +Single cell gene set enrichment analysis is performed by the function runScGSEA() of gficf package which computes GSEA scores for each cell across a set of input pathways by using NMF. +The list of gene sets to use can can be specified trough the category and subcategory parameters. + +```{r runScGsea, results='hide'} + +pbmc.ptw <- RunScGSEA(object = pbmc3k.final, geneID = "symbol", species = "human", category = "H") + +``` + + + +# PART 1 : Evaluate pathways activity in single-cell data +## Visualize UMAP plot computed on genes expression +As dimensionality reduction results computed on genes expression are preserved, let's graph the UMAP computed on genes x cells matrix. In this case cells are colored by their cell type. + +```{r DimPlotGenes} + +DimPlot(pbmc.ptw, reduction = "umap_byGenes", group.by = "seurat_annotations") + +``` + +## Identify how pathways activity change across cell type +To evaluate how pathways activity change across cell type, we utilize DotPlot() function where dot size encodes for the percentage of cells of each cell type in which the given pathway is active. Finally, cell types are ordered (by hierarchical clustering) based on given features by setting cluster.idents = TRUE. + +```{r DotPlotPathways} + +library(ggplot2) +library(RColorBrewer) + +all.pathways = rownames(pbmc.ptw) + +DotPlot(object = pbmc.ptw, + features = all.pathways, + split.by = 'seurat_annotations', + cols = RColorBrewer::brewer.pal(n = length(unique(pbmc.ptw$seurat_annotations)), name = 'Set1'), + cluster.idents = TRUE) + + theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1, size = 10)) + +``` + +## Inspect pathways activation in each cell of a given cell type +To inspect pathways activation in each cell of a given cell type, we use DoHeatmap() function. In this case, we are plotting all given pathways for each cell type. + +```{r DoHeatmapPathways} + +DoHeatmap(pbmc.ptw, features = all.pathways, group.by = "seurat_annotations", angle = 90) + NoLegend() + +``` + + + +# PART 2 : Analyze data based on pathways activity scores +## Visualize UMAP plot computed on pathways activity scores +Now, let's see the UMAP computed on pathways activity scores. To this end, we follow the standard Seurat workflow which requires the computation of PCA on scaled data (i.e. pathway scores normalized by gene-set) before run the UMAP reduction. Results will be stored in separate reduction slot of the Seurat object. +Finally, we display the UMAP plot where cells are colored by their cell type. + +```{r UMAPcellTypePathways, results='hide'} + +pbmc.ptw <- RunPCA(pbmc.ptw, features = all.pathways) +pbmc.ptw <- RunUMAP(pbmc.ptw, dims = 1:5) +DimPlot(pbmc.ptw, reduction = "umap", group.by = 'seurat_annotations') + +``` + +## Cluster cells based on pathways activity scores +Now that we reconstructed pathway’s activity at single cell level we can try to cluster cell according to these values using Seurat functions FindNeighbors() and FindClusters(). Finally, we graph clustering results computed both on gene expression and pathway activity scores as UMAP plot form. +```{r UMAPclustersPathways} + +pbmc.ptw <- FindNeighbors(pbmc.ptw, dims = 1:10) +pbmc.ptw <- FindClusters(pbmc.ptw, resolution = 0.5) +DimPlot(pbmc.ptw, reduction = "umap", group.by = c('seurat_clusters_byGenes', 'seurat_clusters')) + +``` + + +# TIPS +## How can I handle collection with an high number of genesets? +In the case you are handling collections containing a lot of gene-sets, it could be useful to perform a feature selection step. To this end, we first store pathways metadata in a separate variable (i.e. feat.meta), then we create an empty dataframe in the meta.features slot to enable the storing of the output of the Seurat function FindVariableFeatures() setting nfeatures equal to 10% of the number of genesets used. + +```{r PCAselection, results='hide'} + +pbmc.ptw.kegg <- RunScGSEA(object = pbmc3k.final, + geneID = "symbol", + species = "human", + category = "C2", + subcategory = 'CP:KEGG') + +feat.meta <- pbmc.ptw.kegg[['RNA']]@meta.features +pbmc.ptw.kegg[['RNA']]@meta.features <- data.frame(row.names = rownames(pbmc.ptw.kegg)) +nfeatures <- round(nrow(pbmc.ptw.kegg)*0.10) +pbmc.ptw.kegg <- FindVariableFeatures(pbmc.ptw.kegg, selection.method = "vst", nfeatures = nfeatures) +top <- head(VariableFeatures(pbmc.ptw.kegg), nfeatures) +LabelPoints(plot = VariableFeaturePlot(pbmc.ptw.kegg), points = top, repel = TRUE) + +pbmc.ptw.kegg <- RunPCA(pbmc.ptw.kegg, features = VariableFeatures(object = pbmc.ptw.kegg)) +VizDimLoadings(pbmc.ptw.kegg, dims = 1:2, reduction = "pca") + +``` + diff --git a/docs/scGSEA.html b/docs/scGSEA.html new file mode 100644 index 0000000..71f9756 --- /dev/null +++ b/docs/scGSEA.html @@ -0,0 +1,576 @@ + + + + + + + + + + + + + +Running scGSEA from gficf on Seurat Objects + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

This vignette demonstrates how to run single-cell gene set enrichment +analysis with gficf on Seurat objects.

+
+

Single-cell gene set enrichment analysis and transfer learning +for functional annotation of scRNA-seq data

+

Franchini Melania, Pellecchia Simona , Viscido Gaetano, Gambardella +Gennaro

+

NAR Genomics and Bioinformatics, Volume 5, Issue 1, March 2023, +lqad024

+

doi: https://doi.org/10.1093/nargab/lqad024

+

GitHub: https://github.com/gambalab/gficf

+
+

Prerequisites to install:

+ +
require(Seurat)
+require(SeuratData)
+require(SeuratWrappers)
+require(gficf)
+
+

Installation

+

Please install the Seurat wrapper which enables to run single-cell +gene set enrichment analysis with gficf on Seurat objects as reported +here:

+
if (!require(devtools)) {
+    install.packages("devtools")
+}
+devtools::install_github("gambalab/seurat-wrappers")
+
+
+

Introduction

+

GF-ICF is an R package for normalization, visualization and analysis +of of single-cell RNA sequencing data, based on a data transformation +model called term frequency–inverse document frequency (TF-IDF), which +has been extensively used in the field of text mining. Here, we show one +of the latest implementation which enables the pathway activity +computation at single-cell level.

+
+

Load examle data from SeuratData

+

To demonstrate how to run runScGSEA() function on a single-cell +RNA-seq dataset. We will use the pbmc3k.final dataset from the +SeuratData package which contains 2,638 PBMCs processed using the +standard Seurat workflow as demonstrated in Seurat’s guided clustering +tutorial.

+
InstallData("pbmc3k")
+data("pbmc3k.final")
+pbmc3k.final
+
## An object of class Seurat 
+## 13714 features across 2638 samples within 1 assay 
+## Active assay: RNA (13714 features, 2000 variable features)
+##  2 dimensional reductions calculated: pca, umap
+
+
+

Compute pathways activity at single-cell resolution

+

Single cell gene set enrichment analysis is performed by the function +runScGSEA() of gficf package which computes GSEA scores for each cell +across a set of input pathways by using NMF. The list of gene sets to +use can can be specified trough the category and subcategory +parameters.

+
pbmc.ptw <- RunScGSEA(object = pbmc3k.final, geneID = "symbol", species = "human", category = "H")
+
+
+
+

PART 1 : Evaluate pathways activity in single-cell data

+
+

Visualize UMAP plot computed on genes expression

+

As dimensionality reduction results computed on genes expression are +preserved, let’s graph the UMAP computed on genes x cells matrix. In +this case cells are colored by their cell type.

+
DimPlot(pbmc.ptw, reduction = "umap_byGenes", group.by = "seurat_annotations")
+

+
+
+

Identify how pathways activity change across cell type

+

To evaluate how pathways activity change across cell type, we utilize +DotPlot() function where dot size encodes for the percentage of cells of +each cell type in which the given pathway is active. Finally, cell types +are ordered (by hierarchical clustering) based on given features by +setting cluster.idents = TRUE.

+
library(ggplot2)
+library(RColorBrewer)
+
+all.pathways = rownames(pbmc.ptw)
+
+DotPlot(object = pbmc.ptw, features = all.pathways, split.by = "seurat_annotations", cols = RColorBrewer::brewer.pal(n = length(unique(pbmc.ptw$seurat_annotations)),
+    name = "Set1"), cluster.idents = TRUE) + theme(axis.text.x = element_text(angle = 60, vjust = 1,
+    hjust = 1, size = 10))
+

+
+
+

Inspect pathways activation in each cell of a given cell type

+

To inspect pathways activation in each cell of a given cell type, we +use DoHeatmap() function. In this case, we are plotting all given +pathways for each cell type.

+
DoHeatmap(pbmc.ptw, features = all.pathways, group.by = "seurat_annotations", angle = 90) + NoLegend()
+

+
+
+
+

PART 2 : Analyze data based on pathways activity scores

+
+

Visualize UMAP plot computed on pathways activity scores

+

Now, let’s see the UMAP computed on pathways activity scores. To this +end, we follow the standard Seurat workflow which requires the +computation of PCA on scaled data (i.e. pathway scores normalized by +gene-set) before run the UMAP reduction. Results will be stored in +separate reduction slot of the Seurat object. Finally, we display the +UMAP plot where cells are colored by their cell type.

+
pbmc.ptw <- RunPCA(pbmc.ptw, features = all.pathways)
+pbmc.ptw <- RunUMAP(pbmc.ptw, dims = 1:5)
+DimPlot(pbmc.ptw, reduction = "umap", group.by = "seurat_annotations")
+

+
+
+

Cluster cells based on pathways activity scores

+

Now that we reconstructed pathway’s activity at single cell level we +can try to cluster cell according to these values using Seurat functions +FindNeighbors() and FindClusters(). Finally, we graph clustering results +computed both on gene expression and pathway activity scores as UMAP +plot form.

+
pbmc.ptw <- FindNeighbors(pbmc.ptw, dims = 1:10)
+pbmc.ptw <- FindClusters(pbmc.ptw, resolution = 0.5)
+
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
+## 
+## Number of nodes: 2638
+## Number of edges: 85312
+## 
+## Running Louvain algorithm...
+## Maximum modularity in 10 random starts: 0.8276
+## Number of communities: 9
+## Elapsed time: 0 seconds
+
DimPlot(pbmc.ptw, reduction = "umap", group.by = c("seurat_clusters_byGenes", "seurat_clusters"))
+

+
+
+
+

TIPS

+
+

How can I handle collection with an high number of genesets?

+

In the case you are handling collections containing a lot of +gene-sets, it could be useful to perform a feature selection step. To +this end, we first store pathways metadata in a separate variable +(i.e. feat.meta), then we create an empty dataframe in the meta.features +slot to enable the storing of the output of the Seurat function +FindVariableFeatures() setting nfeatures equal to 10% of the number of +genesets used.

+
pbmc.ptw.kegg <- RunScGSEA(object = pbmc3k.final, geneID = "symbol", species = "human", category = "C2",
+    subcategory = "CP:KEGG")
+
+feat.meta <- pbmc.ptw.kegg[["RNA"]]@meta.features
+pbmc.ptw.kegg[["RNA"]]@meta.features <- data.frame(row.names = rownames(pbmc.ptw.kegg))
+nfeatures <- round(nrow(pbmc.ptw.kegg) * 0.1)
+pbmc.ptw.kegg <- FindVariableFeatures(pbmc.ptw.kegg, selection.method = "vst", nfeatures = nfeatures)
+top <- head(VariableFeatures(pbmc.ptw.kegg), nfeatures)
+LabelPoints(plot = VariableFeaturePlot(pbmc.ptw.kegg), points = top, repel = TRUE)
+

+
pbmc.ptw.kegg <- RunPCA(pbmc.ptw.kegg, features = VariableFeatures(object = pbmc.ptw.kegg))
+VizDimLoadings(pbmc.ptw.kegg, dims = 1:2, reduction = "pca")
+

+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/docs/scGSEA.md b/docs/scGSEA.md new file mode 100644 index 0000000..5d71040 --- /dev/null +++ b/docs/scGSEA.md @@ -0,0 +1,220 @@ +Running ScGSEA from gficf on Seurat Objects +================ +Compiled: gennaio 12, 2023 + +- [Introduction](#introduction) + - [Load examle data from + SeuratData](#load-examle-data-from-seuratdata) + - [Compute pathways activity at single-cell + resolution](#compute-pathways-activity-at-single-cell-resolution) +- [PART 1 : Evaluate pathways activity in single-cell + data](#part-1--evaluate-pathways-activity-in-single-cell-data) + - [Visualize UMAP plot computed on genes + expression](#visualize-umap-plot-computed-on-genes-expression) + - [Identify how pathways activity change across cell + type](#identify-how-pathways-activity-change-across-cell-type) + - [Inspect pathways activation in each cell of a given cell + type](#inspect-pathways-activation-in-each-cell-of-a-given-cell-type) +- [PART 2 : Analyze data based on pathways activity + scores](#part-2--analyze-data-based-on-pathways-activity-scores) + - [Visualize UMAP plot computed on pathways activity + scores](#visualize-umap-plot-computed-on-pathways-activity-scores) + - [Cluster cells based on pathways activity + scores](#cluster-cells-based-on-pathways-activity-scores) +- [TIPS](#tips) + - [How can I handle collection with an high number of + genesets?](#how-can-i-handle-collection-with-an-high-number-of-genesets) + +This vignette demonstrates how to run single-cell gene set enrichment +analysis with gficf on Seurat objects. + +> *Single-cell gene set enrichment analysis and transfer learning for +> functional annotation of scRNA-seq data* +> +> Pellecchia Simona , Viscido Gaetano, Franchini Melania, Gambardella +> Gennaro +> +> bioRxiv, 2022 +> +> doi: +> +> GitHub: + +Prerequisites to install: + +- [Seurat](https://satijalab.org/seurat/install) +- [SeuratData](https://github.com/satijalab/seurat-data) +- [SeuratWrappers](https://github.com/satijalab/seurat-wrappers) +- [gficf](https://github.com/gambalab/gficf) + +``` r +library(Seurat) +library(SeuratData) +library(SeuratWrappers) +library(gficf) +``` + +# Introduction + +GF-ICF is an R package for normalization, visualization and analysis of +of single-cell RNA sequencing data, based on a data transformation model +called term frequency–inverse document frequency (TF-IDF), which has +been extensively used in the field of text mining. Here, we show one of +the latest implementation which enables the pathway activity computation +at single-cell level. + +## Load examle data from SeuratData + +To demonstrate how to run runScGSEA() function on a single-cell RNA-seq +dataset. We will use the pbmc3k.final dataset from the SeuratData +package which contains 2,638 PBMCs processed using the standard Seurat +workflow as demonstrated in Seurat’s guided clustering tutorial. + +``` r +InstallData("pbmc3k") +data("pbmc3k.final") +pbmc3k.final +``` + + ## An object of class Seurat + ## 13714 features across 2638 samples within 1 assay + ## Active assay: RNA (13714 features, 2000 variable features) + ## 2 dimensional reductions calculated: pca, umap + +## Compute pathways activity at single-cell resolution + +Single cell gene set enrichment analysis is performed by the function +runScGSEA() of gficf package which computes GSEA scores for each cell +across a set of input pathways by using NMF. The list of gene sets to +use can can be specified trough the category and subcategory parameters. + +``` r +pbmc.ptw <- RunScGSEA(object = pbmc3k.final, geneID = "symbol", species = "human", category = "H") +``` + +# PART 1 : Evaluate pathways activity in single-cell data + +## Visualize UMAP plot computed on genes expression + +As dimensionality reduction results computed on genes expression are +preserved, let’s graph the UMAP computed on genes x cells matrix. In +this case cells are colored by their cell type. + +``` r +DimPlot(pbmc.ptw, reduction = "umap_byGenes", group.by = "seurat_annotations") +``` + +![](scGSEA_files/figure-gfm/DimPlotGenes-1.png) + +## Identify how pathways activity change across cell type + +To evaluate how pathways activity change across cell type, we utilize +DotPlot() function where dot size encodes for the percentage of cells of +each cell type in which the given pathway is active. Finally, cell types +are ordered (by hierarchical clustering) based on given features by +setting cluster.idents = TRUE. + +``` r +library(ggplot2) +library(RColorBrewer) + +all.pathways = rownames(pbmc.ptw) + +DotPlot(object = pbmc.ptw, features = all.pathways, split.by = "seurat_annotations", cols = RColorBrewer::brewer.pal(n = length(unique(pbmc.ptw$seurat_annotations)), + name = "Set1"), cluster.idents = TRUE) + theme(axis.text.x = element_text(angle = 60, vjust = 1, + hjust = 1, size = 10)) +``` + +![](scGSEA_files/figure-gfm/DotPlotPathways-1.png) + +## Inspect pathways activation in each cell of a given cell type + +To inspect pathways activation in each cell of a given cell type, we use +DoHeatmap() function. In this case, we are plotting all given pathways +for each cell type. + +``` r +DoHeatmap(pbmc.ptw, features = all.pathways, group.by = "seurat_annotations", angle = 90) + NoLegend() +``` + +![](scGSEA_files/figure-gfm/DoHeatmapPathways-1.png) + +# PART 2 : Analyze data based on pathways activity scores + +## Visualize UMAP plot computed on pathways activity scores + +Now, let’s see the UMAP computed on pathways activity scores. To this +end, we follow the standard Seurat workflow which requires the +computation of PCA on scaled data (i.e. pathway scores normalized by +gene-set) before run the UMAP reduction. Results will be stored in +separate reduction slot of the Seurat object. Finally, we display the +UMAP plot where cells are colored by their cell type. + +``` r +pbmc.ptw <- RunPCA(pbmc.ptw, features = all.pathways) +pbmc.ptw <- RunUMAP(pbmc.ptw, dims = 1:5) +DimPlot(pbmc.ptw, reduction = "umap", group.by = "seurat_annotations") +``` + +![](scGSEA_files/figure-gfm/UMAPcellTypePathways-1.png) + +## Cluster cells based on pathways activity scores + +Now that we reconstructed pathway’s activity at single cell level we can +try to cluster cell according to these values using Seurat functions +FindNeighbors() and FindClusters(). Finally, we graph clustering results +computed both on gene expression and pathway activity scores as UMAP +plot form. + +``` r +pbmc.ptw <- FindNeighbors(pbmc.ptw, dims = 1:10) +pbmc.ptw <- FindClusters(pbmc.ptw, resolution = 0.5) +``` + + ## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck + ## + ## Number of nodes: 2638 + ## Number of edges: 85312 + ## + ## Running Louvain algorithm... + ## Maximum modularity in 10 random starts: 0.8276 + ## Number of communities: 9 + ## Elapsed time: 0 seconds + +``` r +DimPlot(pbmc.ptw, reduction = "umap", group.by = c("seurat_clusters_byGenes", "seurat_clusters")) +``` + +![](scGSEA_files/figure-gfm/UMAPclustersPathways-1.png) + +# TIPS + +## How can I handle collection with an high number of genesets? + +In the case you are handling collections containing a lot of gene-sets, +it could be useful to perform a feature selection step. To this end, we +first store pathways metadata in a separate variable (i.e. feat.meta), +then we create an empty dataframe in the meta.features slot to enable +the storing of the output of the Seurat function FindVariableFeatures() +setting nfeatures equal to 10% of the number of genesets used. + +``` r +pbmc.ptw.kegg <- RunScGSEA(object = pbmc3k.final, geneID = "symbol", species = "human", category = "C2", + subcategory = "CP:KEGG") + +feat.meta <- pbmc.ptw.kegg[["RNA"]]@meta.features +pbmc.ptw.kegg[["RNA"]]@meta.features <- data.frame(row.names = rownames(pbmc.ptw.kegg)) +nfeatures <- round(nrow(pbmc.ptw.kegg) * 0.1) +pbmc.ptw.kegg <- FindVariableFeatures(pbmc.ptw.kegg, selection.method = "vst", nfeatures = nfeatures) +top <- head(VariableFeatures(pbmc.ptw.kegg), nfeatures) +LabelPoints(plot = VariableFeaturePlot(pbmc.ptw.kegg), points = top, repel = TRUE) +``` + +![](scGSEA_files/figure-gfm/PCAselection-1.png) + +``` r +pbmc.ptw.kegg <- RunPCA(pbmc.ptw.kegg, features = VariableFeatures(object = pbmc.ptw.kegg)) +VizDimLoadings(pbmc.ptw.kegg, dims = 1:2, reduction = "pca") +``` + +![](scGSEA_files/figure-gfm/PCAselection-2.png) diff --git a/docs/scGSEA_files/figure-gfm/DimPlotGenes-1.png b/docs/scGSEA_files/figure-gfm/DimPlotGenes-1.png new file mode 100644 index 0000000..2dd56df Binary files /dev/null and b/docs/scGSEA_files/figure-gfm/DimPlotGenes-1.png differ diff --git a/docs/scGSEA_files/figure-gfm/DoHeatmapPathways-1.png b/docs/scGSEA_files/figure-gfm/DoHeatmapPathways-1.png new file mode 100644 index 0000000..8f812a1 Binary files /dev/null and b/docs/scGSEA_files/figure-gfm/DoHeatmapPathways-1.png differ diff --git a/docs/scGSEA_files/figure-gfm/DotPlotPathways-1.png b/docs/scGSEA_files/figure-gfm/DotPlotPathways-1.png new file mode 100644 index 0000000..58e11a4 Binary files /dev/null and b/docs/scGSEA_files/figure-gfm/DotPlotPathways-1.png differ diff --git a/docs/scGSEA_files/figure-gfm/PCAselection-1.png b/docs/scGSEA_files/figure-gfm/PCAselection-1.png new file mode 100644 index 0000000..a8eb0f9 Binary files /dev/null and b/docs/scGSEA_files/figure-gfm/PCAselection-1.png differ diff --git a/docs/scGSEA_files/figure-gfm/PCAselection-2.png b/docs/scGSEA_files/figure-gfm/PCAselection-2.png new file mode 100644 index 0000000..8b23d01 Binary files /dev/null and b/docs/scGSEA_files/figure-gfm/PCAselection-2.png differ diff --git a/docs/scGSEA_files/figure-gfm/UMAPcellTypePathways-1.png b/docs/scGSEA_files/figure-gfm/UMAPcellTypePathways-1.png new file mode 100644 index 0000000..d108d01 Binary files /dev/null and b/docs/scGSEA_files/figure-gfm/UMAPcellTypePathways-1.png differ diff --git a/docs/scGSEA_files/figure-gfm/UMAPclustersPathways-1.png b/docs/scGSEA_files/figure-gfm/UMAPclustersPathways-1.png new file mode 100644 index 0000000..d28bbe5 Binary files /dev/null and b/docs/scGSEA_files/figure-gfm/UMAPclustersPathways-1.png differ diff --git a/man/PlotMiQC.Rd b/man/PlotMiQC.Rd index 07e9d46..64d43a3 100644 --- a/man/PlotMiQC.Rd +++ b/man/PlotMiQC.Rd @@ -14,9 +14,6 @@ PlotMiQC( } \arguments{ \item{object}{Seurat object} -} -\value{ - } \description{ Run miQC on a Seurat object diff --git a/man/RunPresto.Rd b/man/RunPresto.Rd index 18972d9..e32299f 100644 --- a/man/RunPresto.Rd +++ b/man/RunPresto.Rd @@ -25,7 +25,6 @@ RunPresto( latent.vars = NULL, min.cells.feature = 3, min.cells.group = 3, - pseudocount.use = 1, mean.fxn = NULL, fc.name = NULL, base = 2, diff --git a/man/RunPrestoAll.Rd b/man/RunPrestoAll.Rd index 5cf5c91..9c64c4f 100644 --- a/man/RunPrestoAll.Rd +++ b/man/RunPrestoAll.Rd @@ -22,7 +22,6 @@ RunPrestoAll( latent.vars = NULL, min.cells.feature = 3, min.cells.group = 3, - pseudocount.use = 1, mean.fxn = NULL, fc.name = NULL, base = 2, diff --git a/man/RunScGSEA.Rd b/man/RunScGSEA.Rd new file mode 100644 index 0000000..6f03452 --- /dev/null +++ b/man/RunScGSEA.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/scGSEA.R +\name{RunScGSEA} +\alias{RunScGSEA} +\title{Run Single cell Gene Set Enrichement Analysis on GF-ICF on a Seurat object} +\usage{ +RunScGSEA( + object, + assay = NULL, + filterGenes = TRUE, + nmf.dim = 100, + geneID = c("ensamble", "symbol"), + species = c("mouse", "human"), + category = "H", + subcategory = NULL, + verbose = F +) +} +\arguments{ +\item{object}{Seurat object} + +\item{assay}{Assay to use, defaults to the default assay} + +\item{filterGenes}{Rank of NMF to use for the enrichment} + +\item{nmf.dim}{Rank of NMF to use for the enrichment} + +\item{geneID}{The type of gene names in rownames of \code{object}. It can be either 'ensamble' or 'symbol'. Default: 'ensamble'} + +\item{species}{The type of species in \code{object}. It can be either 'mouse' or 'human'. Default: 'human'} + +\item{category}{MSigDB collection abbreviation, such as H or C1} + +\item{subcategory}{MSigDB sub-collection abbreviation, such as CGP or BP} + +\item{rescale}{If different by none, pathway's activity scores are resealed as Z-score. Possible values are none, byGS or byCell. Default is none} +} +\value{ +returns a Seurat object with pathways x cell matrix +} +\description{ +This function run runScGSEA function from GF-ICF package on Seurat object. +It computes GSEA for each cells across a set of input pathways by using NMF. +}