考虑到咱们生信技能树粉丝对单细胞数据挖掘的需求,我开通了一个专栏《100个单细胞转录组数据降维聚类分群图表复现 》,也亲自示范了几个,不过自己带娃,读博,时间精力有限,所以把剩余的90多个任务安排了学徒,实习生,学员。真的是太棒了 ,群策群力!
看到Jimmy老师开启了《100篇单细胞转录组文献计划》,我连忙报名了,申领了一个任务,是解读发表于2021年3月,广东省人民医院肺癌研究所团队在《Journal For Immunotherapy Of Cancer》(IF=10.252)杂志的,一篇题为“Multiomics analysis reveals a distinct response mechanism in multiple primary lung adenocarcinoma after neoadjuvant immunotherapy”
该研究对1例进行了3个周期派姆单抗治疗(pembrolizumab)的早期MPLC患者进行多组学分析,发现患者不同结节间表现出高度异质性的基因组表型和免疫原性差异,还揭示了Trm浸润增加可能是PD-1阻断后早期免疫反应的一个独特标记,这些都对MPLC的诊断治疗方案和预后标志物的发现开拓了思路。
本研究生成的单细胞RNA测序数据集可在GEO数据库中获得,注册号为登录号GSE146100, 读者们只需要下载 到 GSE146100_NormData.txt 文件,就可以完完全全复制粘贴我下面的代码,马上获得一个单细胞转录组数据分析经验哦!
Step 0、加载需要的R包library (Seurat)library (dplyr)library (patchwork)library (mindr)library (Matrix)
Step 1、数据准备setwd("..\\GSE146100_NormData.txt" ) pbmc.data<-read.table("GSE146100_NormData.txt" ,sep = "\t" ,header = T ,row.names = 1 ) pbmc.data[1 :4 ,1 :4 ] pbmc.data_sparse<-as(as.matrix(pbmc.data),"dgCMatrix" ) pbmc.data_sparse[1 :4 ,1 :4 ] object.size(pbmc.data_sparse)#95033248 bytes save(pbmc.data_sparse,file = "pbmc.data_sparse.Rdata" ) dim(pbmc.data_sparse) rm(pbmc.data)
Step 2、创建Seurat对象#创建Seurat对象 pbmc<- CreateSeuratObject(counts = pbmc.data_sparse, project = "GSE146100" , min.cells = 3 , min.features = 200 ) pbmc#去除线粒体基因 grep(pattern = "^MT\\." ,rownames(pbmc),value = T ) pbmc[["percent.mt" ]] <- PercentageFeatureSet(pbmc, pattern = "^MT\\." ) head(pbmc@meta.data,5 ) summary(pbmc@meta.data$nCount_RNA) VlnPlot(pbmc, features = c("nFeature_RNA" , "nCount_RNA" , "percent.mt" ), ncol = 3 )#已经去除了 data<-pbmc@meta.datalibrary (ggplot2) p<-ggplot(data = data,aes(x=nFeature_RNA))+geom_density() p plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA" , feature2 = "percent.mt" ) plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA" , feature2 = "nFeature_RNA" ) plot1 + plot2#保险起见还是把流程都走一下 pbmc1 <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 80 ) pbmc1 VlnPlot(pbmc1, features = c("nFeature_RNA" , "nCount_RNA" , "percent.mt" ), ncol = 3 )
Step 3、标准化pbmc1 <- NormalizeData(pbmc1, normalization.method = "LogNormalize" , scale.factor = 10000 ) normalized.data<-pbmc1[["RNA" ]]@data normalized.data[1 :10 ,1 :4 ]#10 x 4 sparse Matrix of class "dgCMatrix" # W1_AAACCCACATGTTCAG.1 W1_AAACCCATCAACGCTA.1 W1_AAACGAAAGCAATTAG.1 W1_AAACGAACAAGTCCCG.1 #AL627309.1 . . . .
Step 4、鉴定高变基因#变异指标 pbmc1 <- FindVariableFeatures(pbmc1, selection.method = "vst" , nfeatures = 2500 )#最显著的十个基因 top10 <- head(VariableFeatures(pbmc1), 10 ) top10 plot1 <- VariableFeaturePlot(pbmc1) plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE ) plot1 + plot2
Step 5、降维all.genes <- rownames(pbmc1) pbmc1 <- ScaleData(pbmc1, features = all.genes) pbmc1 <- RunPCA(pbmc1, features = VariableFeatures(object = pbmc1)) print(pbmc1[["pca" ]], dims = 1 :5 , nfeatures = 5 ) VizDimLoadings(pbmc1, dims = 1 :2 , reduction = "pca" ) DimPlot(pbmc1, reduction = "pca" ) DimHeatmap(pbmc1, dims = 1 , cells = 500 , balanced = TRUE ) DimHeatmap(pbmc1, dims = 1 :20 , cells = 500 , balanced = TRUE ) pbmc1 <- JackStraw(pbmc1, num.replicate = 100 ) pbmc1 <- ScoreJackStraw(pbmc1, dims = 1 :20 ) JackStrawPlot(pbmc1, dims = 1 :20 ) ElbowPlot(pbmc1)
Step 6、细胞聚类参考前面的例子:人人都能学会的单细胞聚类分群注释 ,第一次分群就非常漂亮!我们就直奔主题,看T细胞的细分亚群!
pbmc1 <- FindNeighbors(pbmc1, dims = 1 :10 ) pbmc1 <- FindClusters(pbmc1, resolution = 0.8 ) head(Idents(pbmc1), 5 ) head(pbmc1@meta.data) table(pbmc1@meta.data$seurat_clusters) pbmc1 <- RunUMAP(pbmc1, dims = 1 :10 ) DimPlot(pbmc1, reduction = "umap" ,label = T ,label.size = 5 )
Step 7、细胞注释这个时候根据 CD8+ 的marker 基因,比如 "GZMK","CD8B","CD8A" ,确定了上面umap图中的 0,1,4,6,11 亚群就是 我们所需要的 CD8+ 的T细胞亚群:
#手动注释 genes_to_check = c("GZMK" ,"CD8B" ,"CD8A" ) #CD8+的marker DotPlot(pbmc1, group.by = 'seurat_clusters' , features = unique(genes_to_check)) + RotatedAxis() cd8_sce = pbmc1[,pbmc1@meta.data$seurat_clusters %in % c(0 ,1 ,4 ,6 ,11 )] celltype=data.frame(ClusterID=0 :15 , celltype='other' ) celltype[celltype$ClusterID %in % c(0 ,1 ,4 ,6 ,11 ),2 ]='CD8+T' head(celltype) pbmc1@meta.data$celltype = "NA" for (i in 1 :nrow(celltype)){ pbmc1@meta.data[which(pbmc1@meta.data$seurat_clusters == celltype$ClusterID[i]),'celltype' ] <- celltype$celltype[i]} table(pbmc1@meta.data$celltype)#CD8+T other #2805 2384 #与原文比较接近 p3<-DimPlot(pbmc1, reduction = "umap" , group.by = "celltype" ,label = T ) p3 save(pbmc1,file = "marker_annotion.RData" )
Step 8、CD8+细胞亚群提取分析#提取CD8T细胞 rm(list=ls()) load("marker_annotion.RData" ) levels(Idents(pbmc1)) pbmc<-pbmc1[,pbmc1@meta.data$seurat_clusters %in % c(0 ,1 ,4 ,6 ,11 )]#CD8T pbmc levels(Idents(pbmc)) save(pbmc,file = "pbmc_CD8.Rdata" ) load("pbmc_CD8.Rdata" ) pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize" , scale.factor = 1e4 ) pbmc <- FindVariableFeatures(pbmc, selection.method = 'vst' , nfeatures = 2000 ) pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt" ) pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) pbmc <- FindNeighbors(pbmc, dims = 1 :10 ) pbmc <- FindClusters(pbmc, resolution = 0.6 ) head(Idents(pbmc), 5 ) table(pbmc$seurat_clusters) pbmc <- RunUMAP(pbmc, dims = 1 :10 ) DimPlot(pbmc, reduction = 'umap' ) genes_to_check = c("ITGAE" ,"GZMK" ,"CD69" ,"CCR7" ,"HAVCR2" , "FGFBP2" ,"IL7R" ,"SLC4A10" ,"MKI67" ) DotPlot(pbmc, group.by = 'seurat_clusters' , features = unique(genes_to_check)) + RotatedAxis() p1=DimPlot(pbmc, reduction = 'umap' , group.by = 'seurat_clusters' , label = TRUE , pt.size = 0.5 ) + NoLegend() p2=DotPlot(pbmc, group.by = 'seurat_clusters' , features = unique(genes_to_check)) + RotatedAxis()library (patchwork) p1+p2
CD8+细胞亚群提前成功后,继续继续细分亚群,降维聚类分群,如下所示:
可以看到这细分的10个亚群,仍然是各自有自己的高表达量基因,可以进行命名啦!
Step 9、CD8+细胞亚群命名celltype=data.frame(ClusterID=0 :9 , celltype='other' ) celltype[celltype$ClusterID %in % c(0 ),2 ]='CD8-C1-ITGAE' celltype[celltype$ClusterID %in % c(1 ),2 ]='CD8-C2-GZMK' celltype[celltype$ClusterID %in % c(2 ),2 ]='CD8-C3-CD69' celltype[celltype$ClusterID %in % c(3 ),2 ]='CD8-C4-CCR7' celltype[celltype$ClusterID %in % c(4 ),2 ]='CD8-C5-HAVCR2' celltype[celltype$ClusterID %in % c(5 ),2 ]='CD8-C6-FGFBP2' celltype[celltype$ClusterID %in % c(6 ),2 ]='CD8-C7-IL7R' celltype[celltype$ClusterID %in % c(7 ),2 ]='CD8-C8-SLC4A10' celltype[celltype$ClusterID %in % c(8 ),2 ]='CD8-C9-MKI67' head(celltype) pbmc@meta.data$celltype = "NA" for (i in 1 :nrow(celltype)){ pbmc@meta.data[which(pbmc@meta.data$seurat_clusters == celltype$ClusterID[i]),'celltype' ] <- celltype$celltype[i]} table(pbmc@meta.data$celltype) p<-DimPlot(pbmc, reduction = "umap" , group.by = "celltype" ,label = T ) p
文献中图看起来复现的还不错哦!
不过,这样的亚群命名并不是主流哦!在,2021-02-02 , DOI: 10.1016/j.jcmgh.2021.01.020 ,文章名字是:《Identification of Novel Population-Specific Cell Subsets in Chinese Ulcerative Colitis Patients Using Single-Cell RNA Sequencing》把T细胞可以简单分成4类,如下所示:
CD8+ T cell/natural killer cell, 写在文末 咱们现在这个专栏《 100个单细胞转录组数据降维聚类分群图表复现 》分享的代码是到此为止,但是一般来说单细胞文章数据分析还有很多进阶图表制作,比如inferCNV看肿瘤拷贝数变异,monocle看拟时序等等。如果你也需要,可以加入我们这个专栏《 100个单细胞转录组数据降维聚类分群图表复现 》 创作团队,获取进阶指引哦!见:急!计划招募100个单细胞爱好者,免费学全套单细胞降维聚类分群和生物学亚群注释
文末友情推荐 与十万人一起学生信,你值得拥有下面的马拉松学习班: