Seurat包(1):标准流程

构建Seurat

示例一致的10X的数据格式

数据处理成seurat包的输入格式,Seurat直接读取文件夹,文件夹包括三个文件,两个注释信息,一个matrix

1 barcodes.tsv

2 genes.tsv

3 matrix.mtx

两个 key functions

Read10X

CreateSeuratObject

单细胞数据第一种数据类型构建Seurat对象

scRNA_data1 <- Read10X(data.dir = "data/scType1/")

# 构建Seurat对象
srt1 <- CreateSeuratObject(counts = scRNA_data1, 
                            # 表达矩阵,可以是稀疏矩阵,也可以是普通矩阵
                            min.cells = 3, # 去除在小于3个细胞中表达的基因
                            min.features = 200 # 去除只有200个以下基因表达 的细胞
)


# c查看数据类型
typeof(srt1) # "S4"
class(srt1) # "dgCMatrix"

# 计算稀疏矩阵的内存大小
sparse.size <- object.size(x = srt1)
sparse.size

# 查看稀疏矩阵数据
srt1@assays[["RNA"]][1:10, 1:4]
head(srt1@meta.data)

单细胞数据第二种数据类型构建Seurat对象

rm(list = ls())
scRNA_data2 <- read.table("datascType2/GSE118389_counts_rsem.txt")
scRNA_data2[1:4,1:4]
dim(scRNA_data2) 

#原始矩阵有1534个cell,21785个gene
SRT2 <- read.table("data/scType2/SraRunTable.txt", sep = ",", header = T)#这里可以用rio包,读取速度感人
dim(SRT2)
head(SRT2)
# 通过观察,表达矩阵scRNA_data2:1534个cell,21785个gene
# 通过观察SRT2了解到表达矩阵scRNA_data2的细胞名在metadata数据中没找到,那么就到Seriex matrix中寻找对应数据,发现了有细胞名对应的GSM编号,因此提取这两列出来做处理
Cell_ID <- read.table("data/scType2/cellID.txt", sep = "\t", header = T)
head(Cell_ID)

# 将样本名合并到metadata数据中
SRT2_1 <- SRT2 %>%
  inner_join(Cell_ID, "Sample.Name")
head(SRT2_1) # 保留数据备用
write_tsv(SRT2_1, "SRT2_1.txt")

# 构建metadata只需要细胞名称和对应的GSM编号即可,也就是我们从Seriex matrix提取的两列数据
# 匹配表达矩阵和Cell_ID中的细胞名称,让两者顺序一致
scRNA_data2_1 <- scRNA_data2[,Cell_ID$MTX.name]
table(Cell_ID$MTX.name == colnames(scRNA_data2_1))
Cell_ID <- column_to_rownames(Cell_ID, "MTX.name")
# 两者一致后就可以构建Seurat对象
srt2 <- CreateSeuratObject(counts = scRNA_data2_1,
                           meta.data = Cell_ID,
                           min.cells = 3, 
                           min.features = 200)

单细胞数据第三种数据类型构建Seurat对象

# 保留Tumor组的数据进行合并
rm(list = ls())

# 引用包
library(limma)
setwd("")     #设置工作目录
files <- dir()      
files
# 获取目录下所有文件
files <- grep("txt$", files, value=T)      # 提取.txt结尾的文件
files

# 数据合并
scRNA_data3 <- data.frame()
for(i in 1:length(files)){
  inputFile <- files[i]
  # 读取输入文件,并对输入文件进行整理
  rt <- read.table(inputFile, header=T, sep="\t")
  rownames(rt) = rt[,1]
  rt = rt[,2:ncol(rt)]
  
  #数据合并
  if(ncol(scRNA_data3) == 0){
    scRNA_data3 = rt
  }else{
    scRNA_data3 = cbind(scRNA_data3, rt)
  }
}

dim(scRNA_data3) # 306个细胞,24153个基因

# 写出数据

scRNA_data3_Tumor <- rownames_to_column(scRNA_data3, "ID")
write_tsv(scRNA_data3_Tumor, "scRNA_data3_Tumor.txt")

# 同样方法处理对照组数据
rm(list = ls())

# 引用包
setwd("")     #设置工作目录
files <- dir()      
files
# 获取目录下所有文件
files <- grep("txt$", files, value=T)      # 提取.txt结尾的文件
files

# 数据合并
scRNA_data3 <- data.frame()
for(i in 1:length(files)){
  inputFile <- files[i]
  # 读取输入文件,并对输入文件进行整理
  rt <- read.table(inputFile, header=T, sep="\t")
  rownames(rt) = rt[,1]
  rt = rt[,2:ncol(rt)]
  
  #数据合并
  if(ncol(scRNA_data3) == 0){
    scRNA_data3 = rt
  }else{
    scRNA_data3 = cbind(scRNA_data3, rt)
  }
}

dim(scRNA_data3) # 306个细胞,24153个基因

# 写出数据

scRNA_data3_Control <- rownames_to_column(scRNA_data3, "ID")
write_tsv(scRNA_data3_Control, "scRNA_data3_Control.txt")

# 将两个矩阵转移到scType3文件夹下方进行后续处理
rm(list = ls())

setwd("")     #设置工作目录
scRNA_data3_Control <- read.table("data/scType3/scRNA_data3_Control.txt", header = T, row.names = 1)

srt3_1 <- CreateSeuratObject(counts = scRNA_data3_Control,
                           min.cells = 3, 
                           min.features = 200)

scRNA_data3_Tumor <- read.table("data/scType3/scRNA_data3_Tumor.txt", header = T, row.names = 1)

srt3_2 <- CreateSeuratObject(counts = scRNA_data3_Tumor,
                             min.cells = 3, 
                             min.features = 200)
srt3_list <- list(Control = srt3_1, 
                  Tumor = srt3_2)

raw data,

matrix,

Rdata


质控

PercentageFeatureSet 算一下某个自定义基因集在各个细胞中的比例

# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
#增加percent.mt这一列
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

# Show QC metrics for the first 5 cells,增加后的矩阵
head(pbmc@meta.data, 5)#show 前5个细胞

# Visualize QC metrics as a violin plot可视化
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

FeatureScatter

# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
#两个特征之间的关系
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2#
#过滤具有超过 2500 或少于 200 feature
#过滤线粒体计数小于5%的细胞
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

对数据进行标准化

NormalizeData

pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

筛选高变基因(找差异)

它们在某些细胞中高度表达,而在其他细胞中表达低

FindVariableFeatures

pbmc <- FindVariableFeatures(pbmc, 
                             selection.method = "vst", 
                             nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, 
                     points = top10, 
                     repel = TRUE)
plot1 + plot2
纵坐标是方差,横坐标是平均表达量,红色的点就代表了高变基因,黑色的点代表了变化不大的基因

降维

降维有三种

线性PCA

非线性

t-SNE分析,基于PCA分析结果进行非线性降维分析

UMAP分析,t-SNE的完美替代品,速度快

RunPCA

RunTSNE

RunUMAP

rm(list = ls())
library(Seurat)
library(tidyverse)
library(stringr)
library(patchwork)
load("srt3_SCT_final.Rda")
dim(srt3_all)

# PCA分析
srt3_all <- RunPCA(object = srt3_all,
                   npcs = 50,
                   rev.pca = FALSE,
                   weight.by.var = TRUE,
                   verbose = TRUE, # 输出特定PC值的特征基因
                   ndims.print = 1:5, # 输出前5个PC值的特征基因
                   nfeatures.print = 30, # 输出每个PC值的30个特征基因
                   reduction.key = "PC_")

srt3_all <- RunPCA(srt3_all)

ElbowPlot(srt3_all,
          ndims = 50) # 寻找对细胞差异贡献度较大的主成分,

# 展示降维特征及其特征基因的热图
DimHeatmap(object = srt3_all, 
           reduction = "pca", 
           dims = c(1:5),
           nfeatures = 30,
           disp.min = -2.5,
           balanced = TRUE,
           projected = FALSE,
           fast = TRUE,
           raster = TRUE,
           slot = "scale.data",
           combine = TRUE)

# 按照分组观察是否具有批次效应
p1 <- DimPlot(object = srt3_all, 
              reduction = "pca", 
              group.by = "Group",
              dims = c(1,2),
              shuffle = TRUE,
              label = TRUE,
              label.size = 4,
              label.color = "black",
              label.box = TRUE,
              sizes.highlight = 1)
p1
# 按照细胞周期阶段观察是否具有批次效应
p2 <- DimPlot(object = srt3_all, 
              reduction = "pca",
              group.by = "Phase",
              dims = c(1,2),
              shuffle = TRUE,
              label = TRUE,
              label.size = 4,
              label.color = "black",
              label.box = TRUE,
              sizes.highlight = 1)
p2

# t-SNE分析,基于PCA分析结果进行非线性降维分析
srt3_all <- RunTSNE(srt3_all, 
                    dims = 1:19) 
p3 <- DimPlot(object = srt3_all, 
              reduction = "tsne", 
              group.by = "Group",
              dims = c(1,2),
              shuffle = TRUE,
              label = TRUE,
              label.size = 4,
              label.color = "black",
              label.box = TRUE,
              sizes.highlight = 1)
p3

p4 <- DimPlot(object = srt3_all, 
              reduction = "tsne", 
              group.by = "Phase",
              dims = c(1,2),
              shuffle = TRUE,
              label = TRUE,
              label.size = 4,
              label.color = "black",
              label.box = TRUE,
              sizes.highlight = 1)
p4
# UMAP分析,t-SNE的完美替代品,速度快,结果解释多
srt3_all <- RunUMAP(srt3_all, 
                    dims = 1:19)
p5 <- DimPlot(object = srt3_all, 
              reduction = "umap", 
              group.by = "Group",
              dims = c(1,2),
              shuffle = TRUE,
              label = TRUE,
              label.size = 4,
              label.color = "black",
              label.box = TRUE,
              sizes.highlight = 1)
p5

p6 <- DimPlot(object = srt3_all, 
              reduction = "umap", 
              group.by = "Phase",
              dims = c(1,2),
              shuffle = TRUE,
              label = TRUE,
              label.size = 4,
              label.color = "black",
              label.box = TRUE,
              sizes.highlight = 1)
p6

# 合并三种降维方法的结果
library(cowplot)
plot_grid(p1, p3, p5,
          labels = c("A", "B", "C"), 
          nrow = 1,
          align = "h")

library(patchwork)
p2 + p4 + p6 + 
  plot_layout(guides = "collect") & 
  theme(legend.position = "bottom")
# 从上面结果看,未见明显批次效应

# 检查线粒体比例是否为协变量
# FeaturePlot(srt3_all, 
#             reduction = "umap", 
#             features = "percent.mt")
# 展示特定基因
FeaturePlot(srt3_all, 
            reduction = "umap", 
            features = "LIPF")

聚类

# KNN聚类法
srt3_all <- FindNeighbors(srt3_all,
                          k.param = 20,
                          dims = 1:19)
srt3_all <- FindClusters(srt3_all,
                         resolution = 0.8, # 最重要参数,该值越大,cluster 越多
                         method = "igraph", # 根据结果调整
                         algorithm = 1, # 根据结果调整'
                         random.seed = 2021) 

table(srt3_all@meta.data$seurat_clusters)
DimPlot(object = srt3_all,
        group.by = "seurat_clusters",
        reduction = "pca",
        label = TRUE)

PCAPlot(object = srt3_all,
        group.by = "seurat_clusters",
        pt.size = 2, 
        label = TRUE)

TSNEPlot(object = srt3_all,
         group.by = "seurat_clusters",
         pt.size = 2, 
         label = TRUE)    

UMAPPlot(object = srt3_all,
         group.by = "seurat_clusters",
         pt.size = 2, 
         label = TRUE)
save(srt3_all, file = "srt3_all_cluster.Rda")

细胞注释

rm(list = ls())
library(tidyverse)
library(reshape2)
library(patchwork)
library(Seurat)

load(file = "srt3_all_cluster.Rda")

# 从CellMarker下载细胞注释表格
# CellMarker:http://bio-bigdata.hrbmu.edu.cn/CellMarker/index.jsp
cell_marker <- read.csv("CellMarker.csv")
cell_marker <- cell_marker[c(3, 5)]
?separate_rows
cell_marker <- cell_marker %>%
  separate_rows(Cell.Marker, sep = ", ") %>%
  na.omit() %>%
  distinct() %>% 
  arrange(Cell.Type) 
View(cell_marker)
cell_marker$Cell.Marker <- str_trim(cell_marker$Cell.Marker, "left") 

p1 <- DimPlot(srt3_all, 
              reduction = "umap", # pca, umap, tsne
              group.by  = "seurat_clusters",
              label = T)

p2 <- DotPlot(srt3_all, 
              features = unique(cell_marker$Cell.Marker)) +
  theme(axis.text = element_text(size = 8,
                                 angle = 90,
                                 hjust = 1))
p1 + p2

p3 <- FeaturePlot(srt3_all, 
                  reduction = "umap", 
                  features = c("ALDH1A1"), 
                  label = TRUE) 
p3
p1 + p3


range(srt3_all@assays$integrated@scale.data)

# 添加 cell type
Cell_type <- c("0" = "Other Cells",
               "1" = "Cancer Stem Cells")
srt3_all[['cell_type']] <- unname(Cell_type[srt3_all@meta.data$seurat_clusters])

DimPlot(srt3_all, 
        reduction = "umap", 
        group.by = "cell_type",
        label = TRUE, 
        pt.size = 2) + 
  NoLegend()

# VlnPlot(srt3_all, 
#         features = c("ALDH1A1"))

save(srt3_all, file = "srt3_all_cell.Rda")

###SingleR注释###
library(SingleR)
library(celldex)
library(BiocParallel)
# celldex: https://github.com/LTLA/celldex/tree/master/R
# ref <- BlueprintEncodeData() 
# ref <- MouseRNAseqData() 
# ref <- ImmGenData() 
# ref <- DatabaseImmuneCellExpressionData() 
# ref <- NovershternHematopoieticData() 
# ref <- MonacoImmuneData()
# ref <- celldex::HumanPrimaryCellAtlasData()
# save(ref, file = "HumanPrimaryCellAtlasData.Rda")
load("HumanPrimaryCellAtlasData.Rda")

singler <- SingleR(test = srt3_all@assays$RNA@data,
                   ref = ref,
                   labels = ref$label.main,
                   clusters = srt3_all@meta.data$seurat_clusters,
                   fine.tune = TRUE, 
                   BPPARAM = SnowParam(8))
# 查看细胞类型
singler$pruned.labels

#绘制带cell label的tsne和umap图
cell_ids <- singler$pruned.labels
names(cell_ids) <- levels(srt3_all)
levels(srt3_all)

srt3_all <- RenameIdents(srt3_all, cell_ids)
levels(srt3_all)

TSNEPlot(object = srt3_all, pt.size = 0.5, label = TRUE)  
UMAPPlot(object = srt3_all, pt.size = 0.5, label = TRUE)

# 导出注释结果
clusterAnn <- singler %>%
  as.data.frame() %>%
  rownames_to_column("id") %>%
  dplyr::select(id, labels)
write_tsv(clusterAnn, "cluster_annotation.txt")

singler2 <- SingleR(test = srt3_all@assays$RNA@counts, 
                    ref = ref, 
                    labels = ref$label.main)
cellAnn <- singler2 %>%
  as.data.frame() %>%
  rownames_to_column("id") %>%
  dplyr::select(id, labels)
write_tsv(cellAnn, "cell_annotation.txt")

# 将数据写入metadata文件
singleR_type <- srt3_all@active.ident
srt3_all[['singleR_type']] <- unname(singleR_type[rownames(srt3_all@meta.data)])

最近发现的读取数据好用的包包 rio :一站式导入导出几乎所有格式的数据,import()/export(),读取速度感人。