Source file: notebooks/annotation/annotation_bmmc1.Rmd
Last updated: 2018-05-29
Code version: 3ea6d7c
library(ggplot2)
library(dplyr)
library(dropestr)
library(dropEstAnalysis)
library(Matrix)
theme_set(theme_base)
set.seed(42)
kDropEstData <- '../../data/dropest/10x/frozen_bmmc_healthy_donor1/'
kEstFolder <- paste0(kDropEstData, 'est_11_10_umi_quality/')
k10xFolder <- paste0(kDropEstData, 'filtered_matrices_mex/hg19/')
kAnnotationData <- '../../data/annotation/'
kOutputPath <- '../../output/'
holder <- readRDS(paste0(kEstFolder, 'bmmc_no_umi.rds'))
genes <- read.table(paste0(k10xFolder, 'genes.tsv')) %>%
filter(V2 %in% names(which(table(V2) == 1)))
gene_id_to_names <- setNames(genes$V2, genes$V1)
holder$cm <- holder$cm[grep("^[^;]+$", rownames(holder$cm)),]
holder$cm_raw <- holder$cm_raw[grep("^[^;]+$", rownames(holder$cm_raw)),]
umis_per_cell <- sort(Matrix::colSums(holder$cm_raw), decreasing=T)
est_cell_num <- EstimateCellsNumber(umis_per_cell)
Quality scores:
scores <- ScorePipelineCells(holder, mit.chromosome.name='MT',
predict.all=T, verbose=T)[names(umis_per_cell)]
Explained variance after PCA: 99.05%; used 3 PCs.
Used features: ReadsPerUmi, LowExpressedGenesFrac, IntergenicFrac.
PlotCellScores(scores, cells.number=est_cell_num)
Pagoda run:
real_cbs <- names(scores)[1:est_cell_num$expected]
real_cbs <- real_cbs[scores[real_cbs] > 0.5]
real_cbs <- union(names(scores)[scores > 0.9], real_cbs)
r_cm <- holder$cm_raw[, real_cbs]
r_cm <- r_cm[intersect(rownames(r_cm), names(gene_id_to_names)), ]
rownames(r_cm) <- gene_id_to_names[rownames(r_cm)]
pgd <- GetPagoda(r_cm, n.cores=30, tsne.iter.num=5000)
2920 cells, 7949 genes; normalizing ... using plain model winsorizing ... log scale ... done.
calculating variance fit ... using gam 148 overdispersed genes ... 148 persisting ... done.
running PCA using 1000 OD genes .... done
calculating distance ... pearson ...running tSNE using 30 cores:
# clusters <- pgd$clusters$PCA$infomap
# write.csv(clusters, paste0(kAnnotationData, 'bmmc1_clusters.csv'))
# Pagoda uses stochastic clustering algorithm, so we saved clusters from one run
clusters <- read.csv(paste0(kAnnotationData, 'bmmc1_clusters.csv'), row.names=1)
clusters <- setNames(clusters$x, rownames(clusters))
log_mtx <- log10(1e-3 + as.matrix(pgd$counts[names(clusters), ]))
Initial clustering:
PlotPagodaEmbeding(pgd, clusters=clusters, show.ticks=F)
Description:
* https://www.bdbiosciences.com/documents/Bcell_Brochure.pdf - B cells
* https://www.bdbiosciences.com/documents/cd_marker_handbook.pdf - CD Markers
de_genes <- pgd$getDifferentialGenes(type='PCA', groups=clusters,
upregulated.only=T) %>% lapply(rownames)
major_cell_types <- lst(
`T cells` = sapply(de_genes, function(genes) 'CD3D' %in% genes) %>%
which() %>% names() %>% as.integer(),
`B cells` = sapply(de_genes, function(genes) 'MS4A1' %in% genes) %>%
which() %>% names() %>% as.integer()
)
major_type_clusts <- major_cell_types %>% unlist()
if (length(major_type_clusts) != length(unique(major_type_clusts)))
stop("Something goes wrong")
heatmap_genes <- c(
'CD19', 'MME', 'MS4A1',
'CD3D',
'LYZ', 'CD14',
'GZMA', 'GZMB', 'GNLY', 'NKG7',
'FCER1A', 'CST3',
'CD34', 'PTPRC', 'ITGB1', 'ENG',
'EPCAM', 'APOE',
'GYPA', 'CD36', 'TFRC'
)
heatmap_clusters <- clusters[!(clusters %in% unlist(major_cell_types))]
heatmap_clusters <- heatmap_clusters[heatmap_clusters > 9]
PlotExpressionHeatmap(log_mtx, heatmap_clusters, heatmap_genes)
type_ids <- c(major_cell_types, lst(
`CD14+ Monocytes` = c(2),
`NK cells` = c(4),
`Dendritic cells` = c(14),
`Monocyte progenitors` = c(11, 15),
`Epithelial cells` = c(9),
`Erythroblasts` = c(5, 6, 8)
))
type_ids$`B cells` <- c(type_ids$`B cells`, 10, 12:13)
type_ids$`T cells` <- c(type_ids$`T cells`, 16:17)
markers_df <- data.frame(
Type = c("B cells", "T cells", "CD14+ Monocytes", "NK cells", "Dendritic cells",
"Monocyte progenitors", "Epithelial cells", "Erythroblasts"),
Markers = c("CD19, MME (CD10), MS4A1 (CD20)", "CD3D",
"LYZ, CD14", "GZMA, GZMB, GNLY, NKG7", "FCER1A, CST3",
"LYZ, CD34",
"EPCAM (CD326), CD226-, APOE (CD165)", "GYPA (CD235a), CD36, TFRC (CD71)")
)
markers_df$Clusters <- sapply(type_ids, paste, collapse=", ")[as.character(markers_df$Type)]
markers_df
Type | Markers | Clusters |
---|---|---|
B cells | CD19, MME (CD10), MS4A1 (CD20) | 3, 10, 12, 13 |
T cells | CD3D | 1, 7, 16, 17 |
CD14+ Monocytes | LYZ, CD14 | 2 |
NK cells | GZMA, GZMB, GNLY, NKG7 | 4 |
Dendritic cells | FCER1A, CST3 | 14 |
Monocyte progenitors | LYZ, CD34 | 11, 15 |
Epithelial cells | EPCAM (CD326), CD226-, APOE (CD165) | 9 |
Erythroblasts | GYPA (CD235a), CD36, TFRC (CD71) | 5, 6, 8 |
clusters_annotated <- AnnotateClusters(clusters, type_ids)
PlotClustering(pgd, clusters_annotated)
heatmap_genes <- c(
'MS4A1', 'CD40', 'IL4R', 'IL7R',
'CD34', 'CD38', 'MME',
'CD19',
'CD37', 'IGLL5')
heatmap_clusters <- clusters[clusters %in% type_ids$`B cells`]
# heatmap_clusters <- heatmap_clusters[heatmap_clusters > 9]
PlotExpressionHeatmap(log_mtx, heatmap_clusters, heatmap_genes)
b_markers_df <- data.frame(
Type = c("Immature B cells", "Pre-pro B cells", "Pre B cells", "Non-dividing Pre B cells"),
Markers = c("MS4A1 (CD20), CD40, IL4R, IL7R-", "CD34, CD38, MME (CD10), CD24-, IL7R-",
"CD34-, CD40-, IL7R+, IL4R-, CD19+", "CD34, IGLL5")
)
type_ids <- c(type_ids, lst(
`Immature B cells` = c(3),
`Pre-pro B cells` = c(13),
`Non-dividing Pre B cells` = c(10),
`Pre B cells` = c(12)
))
type_ids$`B cells` <- NULL
b_markers_df$Clusters <- sapply(type_ids, paste, collapse=", ")[as.character(b_markers_df$Type)]
b_markers_df
Type | Markers | Clusters |
---|---|---|
Immature B cells | MS4A1 (CD20), CD40, IL4R, IL7R- | 3 |
Pre-pro B cells | CD34, CD38, MME (CD10), CD24-, IL7R- | 13 |
Pre B cells | CD34-, CD40-, IL7R+, IL4R-, CD19+ | 12 |
Non-dividing Pre B cells | CD34, IGLL5 | 10 |
heatmap_genes <- c('CCR7', "CD3E", "CD8B", "SELL", "GNLY", "GZMA", "GZMB", "GZMH",
"GZMK", "PRF1", "NKG7", "IL7R", "CD4")
heatmap_clusters <- clusters[clusters %in% type_ids$`T cells`]
PlotExpressionHeatmap(log_mtx, heatmap_clusters, heatmap_genes)
t_markers_df <- data.frame(
Type = c("Cytotoxic T cells", "T cells"),
Markers = c("NKG7, GZMA, GZMH, GZMK", "CD3E, CD8B, IL7R")
)
type_ids <- c(type_ids, lst(
`Cytotoxic T cells` = c(7)
))
type_ids$`T cells` <- setdiff(type_ids$`T cells`, type_ids$`Cytotoxic T cells`)
t_markers_df$Clusters <- sapply(type_ids, paste, collapse=", ")[as.character(t_markers_df$Type)]
t_markers_df
Type | Markers | Clusters |
---|---|---|
Cytotoxic T cells | NKG7, GZMA, GZMH, GZMK | 7 |
T cells | CD3E, CD8B, IL7R | 1, 16, 17 |
clusters_annotated <- AnnotateClusters(clusters, type_ids)
write.csv(data.frame(Barcode=names(clusters_annotated),
Type=as.vector(clusters_annotated)),
paste0(kAnnotationData, 'bmmc1_clusters_annotated.csv'))
all_markers_df <- bind_rows(list(markers_df, t_markers_df, b_markers_df))
write.csv(all_markers_df[c('Type', 'Markers')],
paste0(kOutputPath, 'tables/annotation_bmmc1_markers.csv'), row.names=F)
all_markers_df
Type | Markers | Clusters |
---|---|---|
B cells | CD19, MME (CD10), MS4A1 (CD20) | 3, 10, 12, 13 |
T cells | CD3D | 1, 7, 16, 17 |
CD14+ Monocytes | LYZ, CD14 | 2 |
NK cells | GZMA, GZMB, GNLY, NKG7 | 4 |
Dendritic cells | FCER1A, CST3 | 14 |
Monocyte progenitors | LYZ, CD34 | 11, 15 |
Epithelial cells | EPCAM (CD326), CD226-, APOE (CD165) | 9 |
Erythroblasts | GYPA (CD235a), CD36, TFRC (CD71) | 5, 6, 8 |
Cytotoxic T cells | NKG7, GZMA, GZMH, GZMK | 7 |
T cells | CD3E, CD8B, IL7R | 1, 16, 17 |
Immature B cells | MS4A1 (CD20), CD40, IL4R, IL7R- | 3 |
Pre-pro B cells | CD34, CD38, MME (CD10), CD24-, IL7R- | 13 |
Pre B cells | CD34-, CD40-, IL7R+, IL4R-, CD19+ | 12 |
Non-dividing Pre B cells | CD34, IGLL5 | 10 |
plot_mtx <- apply(log_mtx, 2, function(vec) scales::rescale(rank(vec)))
raster_width <- 8 / 3
raster_height <- 8 / 4
raster_dpi <- 150
clusters_annotated <- AnnotateClusters(clusters, type_ids)
plot_clusters_annotated <- clusters_annotated
long_type_names <- c("CD14+ Monocytes", "Non-dividing Pre B cells", "Monocyte progenitors",
"Epithelial cells", "Cytotoxic T cells", "Immature B cells",
"Dendritic cells", "Pre-pro B cells")
for (type in long_type_names) {
plot_clusters_annotated[plot_clusters_annotated == type] <- sub(" ", "\n", type)
}
gg_annotated <- PlotClustering(pgd, plot_clusters_annotated, lineheight=0.9, size=0.3,
raster=T, raster.width=raster_width,
raster.height=raster_height, raster.dpi=raster_dpi) +
scale_size_continuous(range=c(2, 3)) +
theme(axis.title=element_blank(), plot.margin=margin())
plot_genes <- c('CD3D', 'NKG7',
'MS4A1', 'IGLL5', 'CD37',
'GYPA', 'EPCAM',
'FCER1A', 'CD14', 'LYZ', 'CD34')
gene_plots <- lapply(plot_genes, PlotGeneFraction, pgd, plot_mtx, title.x=0.04,
title.y=0.99, legend.position="none", size=0.3,
raster=T, raster.width=raster_width,
raster.height=raster_height, raster.dpi=raster_dpi)
gene_plots <- c(list(gg_annotated), gene_plots)
gg_fig <- cowplot::plot_grid(plotlist=gene_plots, ncol=3) +
theme(plot.margin=margin(1, 1, 1, 1))
gg_fig
ggsave(paste0(kOutputPath, 'figures/supp_annotation_bmmc1.pdf'), width=8, height=8)
value | |
---|---|
version | R version 3.4.1 (2017-06-30) |
os | Ubuntu 14.04.5 LTS |
system | x86_64, linux-gnu |
ui | X11 |
language | (EN) |
collate | en_US.UTF-8 |
tz | America/New_York |
date | 2018-05-29 |
package | loadedversion | date | source | |
---|---|---|---|---|
1 | AnnotationDbi | 1.32.3 | 2016-01-28 | Bioconductor |
2 | assertthat | 0.2.0 | 2017-04-11 | CRAN (R 3.4.1) |
3 | backports | 1.1.2 | 2017-12-13 | CRAN (R 3.4.1) |
5 | base64enc | 0.1-3 | 2015-07-28 | cran (@0.1-3) |
6 | bindr | 0.1 | 2016-11-13 | CRAN (R 3.4.1) |
7 | bindrcpp | 0.2 | 2017-06-17 | CRAN (R 3.4.1) |
8 | Biobase | 2.30.0 | 2016-01-28 | Bioconductor |
9 | BiocGenerics | 0.16.1 | 2016-01-28 | Bioconductor |
10 | bit | 1.1-12 | 2014-04-09 | CRAN (R 3.4.1) |
11 | bit64 | 0.9-7 | 2017-05-08 | CRAN (R 3.4.1) |
12 | blob | 1.1.0 | 2017-06-17 | CRAN (R 3.4.1) |
13 | brew | 1.0-6 | 2011-04-13 | CRAN (R 3.4.1) |
14 | Cairo | 1.5-9 | 2015-09-26 | CRAN (R 3.4.1) |
15 | clisymbols | 1.2.0 | 2017-05-21 | CRAN (R 3.4.1) |
16 | colorspace | 1.3-2 | 2016-12-14 | CRAN (R 3.4.1) |
18 | cowplot | 0.9.2 | 2017-12-17 | CRAN (R 3.4.1) |
20 | DBI | 1.0.0 | 2018-05-02 | CRAN (R 3.4.1) |
21 | dendsort | 0.3.3 | 2015-12-14 | cran (@0.3.3) |
22 | digest | 0.6.15 | 2018-01-28 | cran (@0.6.15) |
23 | dplyr | 0.7.4 | 2017-09-28 | CRAN (R 3.4.1) |
24 | dropEstAnalysis | 0.6.0 | 2018-05-16 | local (VPetukhov/dropEstAnalysis@NA) |
25 | dropestr | 0.7.7 | 2018-03-17 | local (@0.7.7) |
26 | evaluate | 0.10.1 | 2017-06-24 | CRAN (R 3.4.1) |
27 | ggplot2 | 2.2.1 | 2016-12-30 | CRAN (R 3.4.1) |
28 | ggrastr | 0.1.5 | 2017-12-28 | Github (VPetukhov/ggrastr@cc56b45) |
29 | ggrepel | 0.7.0 | 2017-09-29 | CRAN (R 3.4.1) |
30 | git2r | 0.21.0 | 2018-01-04 | cran (@0.21.0) |
31 | glue | 1.2.0 | 2017-10-29 | CRAN (R 3.4.1) |
32 | GO.db | 3.2.2 | 2017-11-12 | Bioconductor |
36 | gtable | 0.2.0 | 2016-02-26 | CRAN (R 3.4.1) |
37 | highr | 0.6 | 2016-05-09 | CRAN (R 3.4.1) |
38 | htmltools | 0.3.6 | 2017-04-28 | CRAN (R 3.4.1) |
39 | igraph | 1.2.1 | 2018-03-10 | cran (@1.2.1) |
40 | IRanges | 2.4.8 | 2016-09-15 | Bioconductor |
41 | irlba | 2.3.2 | 2018-01-11 | cran (@2.3.2) |
42 | KernSmooth | 2.23-15 | 2015-06-29 | CRAN (R 3.4.0) |
43 | knitr | 1.20 | 2018-02-20 | cran (@1.20) |
44 | ks | 1.11.0 | 2018-01-16 | local (VPetukhov/ks@NA) |
45 | labeling | 0.3 | 2014-08-23 | CRAN (R 3.4.1) |
46 | lattice | 0.20-35 | 2017-03-25 | CRAN (R 3.4.1) |
47 | lazyeval | 0.2.1 | 2017-10-29 | CRAN (R 3.4.1) |
48 | magrittr | 1.5 | 2014-11-22 | CRAN (R 3.4.1) |
49 | MASS | 7.3-47 | 2017-04-21 | CRAN (R 3.4.0) |
50 | Matrix | 1.2-12 | 2017-11-16 | CRAN (R 3.4.1) |
51 | mclust | 5.4 | 2017-11-22 | CRAN (R 3.4.1) |
52 | memoise | 1.1.0 | 2017-04-21 | CRAN (R 3.4.1) |
54 | mgcv | 1.8-22 | 2017-09-19 | CRAN (R 3.4.1) |
55 | munsell | 0.4.3 | 2016-02-13 | CRAN (R 3.4.1) |
56 | mvtnorm | 1.0-7 | 2018-01-26 | cran (@1.0-7) |
57 | nlme | 3.1-131 | 2017-02-06 | CRAN (R 3.4.0) |
58 | pagoda2 | 0.0.0.9002 | 2018-04-08 | local (hms-dbmi/pagoda2@NA) |
60 | pcaMethods | 1.60.0 | 2017-11-12 | Bioconductor |
61 | pcaPP | 1.9-73 | 2018-01-14 | cran (@1.9-73) |
62 | pheatmap | 1.0.8 | 2015-12-11 | CRAN (R 3.4.1) |
63 | pkgconfig | 2.0.1 | 2017-03-21 | CRAN (R 3.4.1) |
64 | plyr | 1.8.4 | 2016-06-08 | CRAN (R 3.4.1) |
65 | R6 | 2.2.2 | 2017-06-17 | CRAN (R 3.4.1) |
66 | RColorBrewer | 1.1-2 | 2014-12-07 | CRAN (R 3.4.1) |
67 | Rcpp | 0.12.17 | 2018-05-18 | cran (@0.12.17) |
68 | rjson | 0.2.19 | 2018-05-18 | cran (@0.2.19) |
69 | rlang | 0.1.4 | 2017-11-05 | CRAN (R 3.4.1) |
70 | rmarkdown | 1.9 | 2018-03-01 | CRAN (R 3.4.1) |
71 | Rook | 1.1-1 | 2014-10-20 | CRAN (R 3.4.1) |
72 | rprojroot | 1.3-2 | 2018-01-03 | cran (@1.3-2) |
73 | RSQLite | 2.0 | 2017-06-19 | CRAN (R 3.4.1) |
74 | Rtsne | 0.14 | 2017-11-12 | Github (jkrijthe/Rtsne@1d0f926) |
75 | S4Vectors | 0.8.11 | 2016-01-30 | Bioconductor |
76 | scales | 0.5.0 | 2017-08-24 | CRAN (R 3.4.1) |
77 | sessioninfo | 1.0.0 | 2017-06-21 | CRAN (R 3.4.1) |
80 | stringi | 1.1.7 | 2018-03-12 | cran (@1.1.7) |
81 | stringr | 1.3.0 | 2018-02-19 | cran (@1.3.0) |
82 | tibble | 1.3.4 | 2017-08-22 | CRAN (R 3.4.1) |
84 | triebeard | 0.3.0 | 2016-08-04 | cran (@0.3.0) |
85 | urltools | 1.7.0 | 2018-01-20 | cran (@1.7.0) |
87 | withr | 2.1.2 | 2018-03-15 | cran (@2.1.2) |
88 | yaml | 2.1.18 | 2018-03-08 | cran (@2.1.18) |
This R Markdown site was created with workflowr