Installing SuperCell package from gitHub
if (!requireNamespace("remotes")) install.packages("remotes")
::install_github("GfellerLab/SuperCell") remotes
library(SuperCell)
Data available at authors’ GitHub under file name sincell_with_class_5cl.Rdata.
data(cell_lines) # list with GE - gene expression matrix (logcounts), meta - cell meta data
<- cell_lines$GE
GE dim(GE) # genes as rows and cells as columns
#> Loading required package: Matrix
#> NULL
<- cell_lines$meta cell.meta
(i.e., 20
times less metacells (called ‘supercells’ in
the package functions) than single cells) by first building a kNN (\(k=5\)) network using top \(n.var.genes=1000\) most variable genes for
dimentionality reduction. Function SCimplify()
computes the
partition into metacells, this information is available with the field
membership
.
<- 20 # graining level
gamma <- 5
k.knn
# Building metacells from gene expressio (GE)
<- SCimplify(GE, # gene expression matrix
SC k.knn = k.knn, # number of nearest neighbors to build kNN network
gamma = gamma, # graining level
n.var.genes = 1000 # number of the top variable genes to use for dimentionality reduction
)
# Alternative, metacells can be build from low-dimensional embedding. For this, first compute low-dim embedding and pass it into \code{SCimplify_from_embedding()}
if(1){
<- SCimplify_from_embedding(
SC_alt X = stats::prcomp(Matrix::t(GE[SC$genes.use,]), rank. = 10)$x, # PCA embedding
k.knn = k.knn, # number of nearest neighbors to build kNN network
gamma = gamma # graining level)
)
}
# plot network of metacells
supercell_plot(SC$graph.supercells, # network
color.use = "gray", # color of the nodes
main = paste("Metacell network, gamma =", gamma),
seed = 1)
# plot single-cell network
supercell_plot(SC$graph.singlecell, # network
group = cell.meta, # colored by cell line assignment
do.frames = F, # not drawing frames around each node
main = paste("Single-cell network, N =", dim(GE)[2]),
lay.method = "components") # method to compute the network 2D embedding
To get a gene expression of metacells, we need to average gene
expressions within each metacell with function
supercell_GE()
<- supercell_GE(GE, SC$membership)
SC.GE dim(SC.GE)
#> [1] 2000 196
We now assign each metcell to a particular cell line based on the
cell line data, for this, we use function
supercell_assign()
. By default, this function assign each
metacell to a cluster with the largest Jaccard coefficient to avoid
biases towards very rare or very abundant clusters. Alternatively,
assigmnent can be performed using relative (may cause biase towards very
small populations) or absolute (may cause biase towards large
populations) abundance with method = "relative"
or
method = "absolute"
, respectively.
$cell_line <- supercell_assign(clusters = cell.meta, # single-cell assigment to cell lines (clusters)
SCsupercell_membership = SC$membership, # single-cell assignment to metacells
method = "jaccard")
<- 1 # seed for network plotting
seed
# plot network of metacells colored by cell line assignment
supercell_plot(SC$graph.supercells,
group = SC$cell_line,
seed = seed,
main = "Metacells colored by cell line assignment")
The quality of assignment can be evaluated with metacell purity
(function supercell_purity()
) that returns the proportion
of the most abundant cell type (in this case, cell line) in each
metacell.
# compute purity of metacells in terms of cell line composition
<- supercell_purity(clusters = cell.meta,
purity supercell_membership = SC$membership, method = 'entropy')
hist(purity, main = "Purity of metacells \nin terms of cell line composition")
$purity <- purity SC
Some options to plot networks of metacells
## rotate network to be more consistent with the single-cell one
supercell_plot(SC$graph.supercells,
group = SC$cell_line,
seed = seed,
alpha = -pi/2,
main = "Metacells colored by cell line assignment (rotated)")
## alternatively, any layout can be provided as 2xN numerical matrix, where N is number of nodes (cells)
## Let's plot metacell network using the layout of the single-cell network:
## 1) get single-cell network layout
<- igraph::layout_components(SC$graph.singlecell)
my.lay.sc
## 2) compute metacell network layout averaging coordinates withing metacells
<- Matrix::t(supercell_GE(ge = t(my.lay.sc), groups = SC$membership))
my.lay.SC
## 3) provide layout with the parameter $lay$
supercell_plot(SC$graph.supercells,
group = SC$cell_line,
lay = my.lay.SC,
main = "Metacells colored by cell line assignment (averaged coordinates)")
#dimensionality reduction
<- supercell_prcomp(Matrix::t(SC.GE), # metacell gene expression matrix
SC.PCA genes.use = SC$genes.use, # genes used for the coarse-graining, but any set can be provided
supercell_size = SC$supercell_size, # sample-weighted pca
k = 20)
## compute distance
<- dist(SC.PCA$x)
D
## cluster metacells
<- supercell_cluster(D = D, k = 5, supercell_size = SC$supercell_size)
SC.clusters $clustering <- SC.clusters$clustering SC
## mapping metacell cluster to cell line
<- supercell_assign(supercell_membership = SC$clustering, clusters = SC$cell_line)
map.cluster.to.cell.line ## clustering as cell line
$clustering_reordered <- map.cluster.to.cell.line[SC$clustering]
SC
supercell_plot(SC$graph.supercells,
group = SC$clustering_reordered,
seed = seed,
alpha = -pi/2,
main = "Metacells colored by cluster")
<- supercell_FindAllMarkers(ge = SC.GE, # metacell gene expression matrix
markers.all.positive supercell_size = SC$supercell_size, # size of metacell for sample-weighted method
clusters = SC$clustering_reordered, # clustering
logfc.threshold = 1, # mininum log fold-change
only.pos = T) # keep only upregulated genes
$H2228[1:20,]
markers.all.positive#> p.value adj.p.value pct.1 pct.2 logFC w.mean.1 w.mean.2
#> CD74 0 0 1 0.9547755 4.819945 5.141367 0.33886734
#> SAA1 0 0 1 0.9743833 4.808625 5.692774 0.66076230
#> S100A9 0 0 1 0.9772296 4.736743 3.263651 0.48934905
#> CXCL1 0 0 1 0.9756483 4.720862 5.075143 0.33463836
#> CXCL6 0 0 1 0.9127135 4.084244 3.798647 0.15469295
#> LCN2 0 0 1 0.9984187 4.023480 6.908461 1.45023391
#> HLA-B 0 0 1 0.9955724 3.588336 5.081833 1.22688095
#> HLA-C 0 0 1 0.9690070 3.404755 4.171374 0.72304466
#> CXCL8 0 0 1 0.8437698 3.112256 3.159659 0.15125884
#> HLA-DRA 0 0 1 0.7966477 3.063288 2.759934 0.08436613
#> BIRC3 0 0 1 0.8478811 3.038374 3.178506 0.19329216
#> IGFBP3 0 0 1 0.9955724 2.977667 4.460458 1.32173831
#> HLA-DRB1 0 0 1 0.7375079 2.897079 2.656783 0.06368299
#> CCL20 0 0 1 0.7783049 2.800380 2.734728 0.08422758
#> MMP7 0 0 1 0.9623656 2.688850 3.073111 0.54129093
#> HLA-A 0 0 1 0.9987350 2.678104 4.565705 1.90320058
#> SAA2 0 0 1 0.6907021 2.613938 2.575958 0.12673623
#> HLA-DRB5 0 0 1 0.5638836 2.404624 2.269505 0.03905744
#> PDZK1IP1 0 0 1 0.8782416 2.362958 3.344296 0.50619818
#> HLA-DPA1 0 0 1 0.5866540 2.337615 2.093134 0.04609662
<- c("DHRS2", "MT1P1", "TFF1", "G6PD", "CD74", "CXCL8")
genes.to.plot
supercell_VlnPlot(ge = SC.GE,
supercell_size = SC$supercell_size,
clusters = SC$clustering_reordered,
features = genes.to.plot,
idents = c("H1975", "H2228", "A549"),
ncol = 3)
supercell_GeneGenePlot(ge = SC.GE,
gene_x = genes.to.plot[1:3],
gene_y = genes.to.plot[4:6],
supercell_size = SC$supercell_size,
clusters = SC$clustering_reordered,)
#> $p
#>
#> $w.cor
#> $w.cor$TFF1_CXCL8
#> [1] -0.2457515
#>
#> $w.cor$DHRS2_G6PD
#> [1] -0.2069167
#>
#> $w.cor$MT1P1_CD74
#> [1] 0.08581136
#>
#>
#> $w.pval
#> $w.pval$TFF1_CXCL8
#> [1] 5.511979e-55
#>
#> $w.pval$DHRS2_G6PD
#> [1] 3.781028e-39
#>
#> $w.pval$MT1P1_CD74
#> [1] 7.470102e-08
supercell_rescale()
function<- supercell_rescale(SC, gamma = 10)
SC10
$cell_line <- supercell_assign(clusters = cell.meta, # single-cell assigment to cell lines (clusters)
SC10supercell_membership = SC10$membership, # single-cell assignment to metacells
method = "jaccard")
supercell_plot(SC10$graph.supercells,
group = SC10$cell_line,
seed = 1,
main = "Metacells at gamma = 10 colored by cell line assignment")
### don't forget to recompute metacell gene expression matrix for a new grainig level with
# GE10 <- supercell_GE(GE, SC10$membership)
### if you are going to perform downstream analyses at the new graining level
In case you want to perform other analyses available with Seurat
package, we can convert SuperCell to Seurat
object with function supercell_2_Seurat()
or to SingleCellExperiment
object with function ‘supercell_2_sce()’. Let consider a Seurat
example.
#install.packages("Seurat")
library(Seurat)
#> The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
#> which was just loaded, will retire in October 2023.
#> Please refer to R-spatial evolution reports for details, especially
#> https://r-spatial.org/r/2023/05/15/evolution4.html.
#> It may be desirable to make the sf package available;
#> package maintainers should consider adding sf to Suggests:.
#> The sp package is now running under evolution status 2
#> (status 2 uses the sf package in place of rgdal)
#> Attaching SeuratObject
<- supercell_2_Seurat(SC.GE = as.matrix(SC.GE), SC = SC, fields = c("cell_line", "clustering", "clustering_reordered"))
m.seurat #> [1] "Done: NormalizeData"
#> [1] "Doing: data to normalized data"
#> [1] "Doing: weighted scaling"
#> [1] "Done: weighted scaling"
#> Computing nearest neighbor graph
#> Computing SNN
Note: since metacells have different size (consist of different
number of cells), we apply sample-weighted algorithms at most af the
steps of the downstream analyses. Thus, when coercing SuperCell to
Seurat, we replaced PCA, saling and kNN graph of Seurat object with
those obtained applying sample-weighted version of PCA, scaling or
SuperCell graph (i.e., metacell network), respectively. If you then
again apply RunPCA
, ScaleData
, or
FindNeighbors
, the result will be rewritten, but you will
be able to access them with
Embeddings(m.seurat, reduction = "pca_weigted")
,
m.seurat@assays$RNA@misc[["scale.data.weighted"]]
, or
m.seurat@graphs$RNA_super_cells
, respectively.
PCAPlot(m.seurat)
### cluster SuperCell network (unweighted clustering)
<- FindClusters(m.seurat, graph.name = "RNA_nn") # now RNA_nn is metacell network
m.seurat #> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 196
#> Number of edges: 703
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8322
#> Number of communities: 7
#> Elapsed time: 0 seconds
<- FindNeighbors(m.seurat, verbose = FALSE) # RNA_nn has been replaced with kNN graph of metacell (unweigted)
m.seurat <- FindClusters(m.seurat, graph.name = "RNA_nn")
m.seurat #> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 196
#> Number of edges: 2010
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7908
#> Number of communities: 5
#> Elapsed time: 0 seconds