Source file: notebooks/annotation/annotation_pbmc8k.Rmd
Last updated: 2018-05-29
Code version: 3ea6d7c
holder <- readRDS(paste0(kEstFolder, 'pbmc8k_no_umi.rds'))
genes <- read.table(paste0(k10xFolder, 'genes.tsv')) %>%
filter(V2 %in% names(which(table(V2) == 1)))
gene_id_to_names <- setNames(genes$V2, genes$V1)
holder$cm_raw <- holder$cm_raw[grep("^[^;]+$", rownames(holder$cm_raw)),]
umis_per_cell <- sort(Matrix::colSums(holder$cm_raw), decreasing=T)
est_cell_num <- EstimateCellsNumber(umis_per_cell)
scores <- ScorePipelineCells(holder, mit.chromosome.name='MT',
predict.all=T)[names(umis_per_cell)]
PlotCellScores(scores, cells.number=est_cell_num)
Pagoda run:
real_cbs <- names(scores)[1:est_cell_num$expected]
real_cbs <- real_cbs[scores[real_cbs] > 0.9]
r_cm <- holder$cm_raw[, real_cbs]
r_cm <- r_cm[intersect(rownames(r_cm), names(gene_id_to_names)), ]
rownames(r_cm) <- gene_id_to_names[rownames(r_cm)]
pgd <- GetPagoda(r_cm, n.cores=30)
8451 cells, 12931 genes; normalizing ... using plain model winsorizing ... log scale ... done.
calculating variance fit ... using gam 485 overdispersed genes ... 485 persisting ... done.
running PCA using 1000 OD genes .... done
calculating distance ... pearson ...running tSNE using 30 cores:
# clusters <- pgd$clusters$PCA$infomap
# write.csv(clusters, paste0(kAnnotationData, 'pbmc8k_clusters.csv'))
# Pagoda uses stochastic clustering algorithm, so we saved clusters from one run
clusters <- read.csv(paste0(kAnnotationData, 'pbmc8k_clusters.csv'), row.names=1)
clusters <- setNames(clusters$x, rownames(clusters))
log_mtx <- log10(1e-3 + as.matrix(pgd$counts[names(clusters), ]))
Initial clustering:
PlotPagodaEmbeding(pgd, clusters=clusters, show.ticks=F)
de_genes <- pgd$getDifferentialGenes(type='PCA', groups=clusters,
upregulated.only=T) %>% lapply(rownames)
major_cell_types <- lst(
`T cells` = sapply(de_genes, function(genes) 'CD3D' %in% genes) %>%
which() %>% names() %>% as.integer(),
`B cells` = sapply(de_genes, function(genes) 'MS4A1' %in% genes) %>%
which() %>% names() %>% as.integer()
)
major_type_clusts <- major_cell_types %>% unlist()
if (length(major_type_clusts) != length(unique(major_type_clusts)))
stop("Something goes wrong")
heatmap_genes <- c(
'MS4A1',
'CD3D', 'CD3E',
'LYZ', 'CD14',
'GNLY', 'NKG7',
"GZMA", "GZMB", "GZMH", "GZMK",
'FCGR3A', 'MS4A7',
'FCER1A', 'CST3',
'PPBP')
heatmap_clusters <- clusters[!(clusters %in% unlist(major_cell_types))]
# heatmap_clusters <- heatmap_clusters[heatmap_clusters > 20]
PlotExpressionHeatmap(log_mtx, heatmap_clusters, heatmap_genes)
type_ids <- c(major_cell_types, lst(
`CD14+ Monocytes` = c(1, 12, 13, 34),
`NK cells 1` = c(9, 29, 30),
`NK cells 2` = 26,
`FCGR3A+ Monocytes` = 15,
`Dendritic cells` = c(18, 24),
`Megakaryocytes` = 31
))
type_ids$`T cells` <- c(type_ids$`T cells`, 35)
markers_df <- data.frame(
Type = c(
"B cells", "T cells", "CD14+ Monocytes", "NK cells 1", "NK cells 2",
"FCGR3A+ Monocytes", "Dendritic cells", "Megakaryocytes"),
Markers = c(
"MS4A1", "CD3D, CD3E", "LYZ, CD14", "GNLY, NKG7", "GZMB, NKG7",
"FCGR3A, MS4A7", "FCER1A, CST3", "PPBP")
)
markers_df$Clusters <- sapply(type_ids, paste, collapse=", ")[as.character(markers_df$Type)]
markers_df
Type | Markers | Clusters |
---|---|---|
B cells | MS4A1 | 4, 8, 19, 22, 23, 28, 29, 33 |
T cells | CD3D, CD3E | 2, 3, 5, 6, 7, 10, 11, 14, 16, 17, 20, 21, 25, 27, 32, 35 |
CD14+ Monocytes | LYZ, CD14 | 1, 12, 13, 34 |
NK cells 1 | GNLY, NKG7 | 9, 29, 30 |
NK cells 2 | GZMB, NKG7 | 26 |
FCGR3A+ Monocytes | FCGR3A, MS4A7 | 15 |
Dendritic cells | FCER1A, CST3 | 18, 24 |
Megakaryocytes | PPBP | 31 |
clusters_annotated <- AnnotateClusters(clusters, type_ids)
PlotClustering(pgd, clusters_annotated)
heatmap_genes <- c(
"CD3D", "CD3E", "CCR7",
"CD8A", "CD8B",
"IL7R", "CD4",
"NKG7", "GZMA", "GZMH", "GZMK"
)
heatmap_clusters <- clusters[clusters %in% type_ids$`T cells`]
# heatmap_clusters <- heatmap_clusters[heatmap_clusters > 3]
PlotExpressionHeatmap(log_mtx, heatmap_clusters, heatmap_genes)
t_markers_df <- data.frame(
Type = c("Naive T cells", "CD8 T cells", "CD4+ T cells", "Cytotoxic T cells"),
Markers = c("CCR7", "CD8A, CD8B", "IL7R, CD4", "NKG7, GZMA, GZMH, GZMK")
)
type_ids <- c(type_ids, lst(
`Naive T cells` = c(2, 7, 21),
`CD8 T cells` = c(3),
`CD4+ T cells` = c(5, 10, 14, 35),
`Cytotoxic T cells` = c(6, 11, 16, 17, 20, 25, 27, 32)
))
type_ids$`T cells` <- NULL
t_markers_df$Clusters <- sapply(type_ids, paste, collapse=", ")[as.character(t_markers_df$Type)]
t_markers_df
Type | Markers | Clusters |
---|---|---|
Naive T cells | CCR7 | 2, 7, 21 |
CD8 T cells | CD8A, CD8B | 3 |
CD4+ T cells | IL7R, CD4 | 5, 10, 14, 35 |
Cytotoxic T cells | NKG7, GZMA, GZMH, GZMK | 6, 11, 16, 17, 20, 25, 27, 32 |
clusters_annotated <- AnnotateClusters(clusters, type_ids)
write.csv(data.frame(Barcode=names(clusters_annotated),
Type=as.vector(clusters_annotated)),
paste0(kAnnotationData, 'pbmc8k_clusters_annotated.csv'))
all_markers_df <- rbind(markers_df, t_markers_df)
write.csv(all_markers_df[c('Type', 'Markers')],
paste0(kOutputPath, 'tables/annotation_pbmc8k_markers.csv'), row.names=F)
all_markers_df
Type | Markers | Clusters |
---|---|---|
B cells | MS4A1 | 4, 8, 19, 22, 23, 28, 29, 33 |
T cells | CD3D, CD3E | 2, 3, 5, 6, 7, 10, 11, 14, 16, 17, 20, 21, 25, 27, 32, 35 |
CD14+ Monocytes | LYZ, CD14 | 1, 12, 13, 34 |
NK cells 1 | GNLY, NKG7 | 9, 29, 30 |
NK cells 2 | GZMB, NKG7 | 26 |
FCGR3A+ Monocytes | FCGR3A, MS4A7 | 15 |
Dendritic cells | FCER1A, CST3 | 18, 24 |
Megakaryocytes | PPBP | 31 |
Naive T cells | CCR7 | 2, 7, 21 |
CD8 T cells | CD8A, CD8B | 3 |
CD4+ T cells | IL7R, CD4 | 5, 10, 14, 35 |
Cytotoxic T cells | NKG7, GZMA, GZMH, GZMK | 6, 11, 16, 17, 20, 25, 27, 32 |
raster_width <- 8 / 3
raster_height <- 8 / 4
raster_dpi <- 150
long_type_names <- c("CD14+ Monocytes", "FCGR3A+ Monocytes", "Dendritic cells",
"Naive T cells", "Cytotoxic T cells", "CD4+ T cells")
for (type in long_type_names) {
clusters_annotated[clusters_annotated == type] <- sub(" ", "\n", type)
}
gg_annotation <- PlotClustering(pgd, clusters_annotated, lineheight=0.7, size=0.2,
raster=T, raster.width=raster_width,
raster.height=raster_height, raster.dpi=raster_dpi) +
theme_pdf(show.ticks=F) + ggpubr::rremove("xylab") +
scale_size_continuous(range=c(3, 3)) +
theme(plot.margin=margin())
plot_mtx <- apply(log_mtx, 2, function(vec) scales::rescale(rank(vec)))
plot_genes <- c('MS4A1', 'LYZ',
'FCGR3A', 'FCER1A', 'PPBP',
'CD3D', 'CCR7', 'CD8A', 'IL7R',
'NKG7', 'GZMB')
gene_plots <- lapply(plot_genes, PlotGeneFraction, pgd, plot_mtx, title.x=0.04,
title.y=0.99, legend.position="none", size=0.2,
raster=T, raster.width=raster_width,
raster.height=raster_height, raster.dpi=raster_dpi)
gene_plots <- c(list(gg_annotation), gene_plots)
gg_fig <- cowplot::plot_grid(plotlist=gene_plots, ncol=3) +
theme(plot.margin=margin(1, 1, 1, 1))
gg_fig
ggsave(paste0(kOutputPath, 'figures/supp_annotation_pbmc8k.pdf'), width=8, height=8)
data.frame(value=unlist(sessioninfo::platform_info()))
value | |
---|---|
version | R version 3.4.1 (2017-06-30) |
os | Ubuntu 14.04.5 LTS |
system | x86_64, linux-gnu |
ui | X11 |
language | (EN) |
collate | en_US.UTF-8 |
tz | America/New_York |
date | 2018-05-29 |
as.data.frame(sessioninfo::package_info())[c('package', 'loadedversion', 'date', 'source')]
package | loadedversion | date | source | |
---|---|---|---|---|
1 | AnnotationDbi | 1.32.3 | 2016-01-28 | Bioconductor |
2 | assertthat | 0.2.0 | 2017-04-11 | CRAN (R 3.4.1) |
3 | backports | 1.1.2 | 2017-12-13 | CRAN (R 3.4.1) |
5 | base64enc | 0.1-3 | 2015-07-28 | cran (@0.1-3) |
6 | bindr | 0.1 | 2016-11-13 | CRAN (R 3.4.1) |
7 | bindrcpp | 0.2 | 2017-06-17 | CRAN (R 3.4.1) |
8 | Biobase | 2.30.0 | 2016-01-28 | Bioconductor |
9 | BiocGenerics | 0.16.1 | 2016-01-28 | Bioconductor |
10 | bit | 1.1-12 | 2014-04-09 | CRAN (R 3.4.1) |
11 | bit64 | 0.9-7 | 2017-05-08 | CRAN (R 3.4.1) |
12 | blob | 1.1.0 | 2017-06-17 | CRAN (R 3.4.1) |
13 | brew | 1.0-6 | 2011-04-13 | CRAN (R 3.4.1) |
14 | Cairo | 1.5-9 | 2015-09-26 | CRAN (R 3.4.1) |
15 | clisymbols | 1.2.0 | 2017-05-21 | CRAN (R 3.4.1) |
16 | colorspace | 1.3-2 | 2016-12-14 | CRAN (R 3.4.1) |
18 | cowplot | 0.9.2 | 2017-12-17 | CRAN (R 3.4.1) |
20 | DBI | 1.0.0 | 2018-05-02 | CRAN (R 3.4.1) |
21 | dendsort | 0.3.3 | 2015-12-14 | cran (@0.3.3) |
22 | digest | 0.6.15 | 2018-01-28 | cran (@0.6.15) |
23 | dplyr | 0.7.4 | 2017-09-28 | CRAN (R 3.4.1) |
24 | dropEstAnalysis | 0.6.0 | 2018-05-16 | local (VPetukhov/dropEstAnalysis@NA) |
25 | dropestr | 0.7.7 | 2018-03-17 | local (@0.7.7) |
26 | evaluate | 0.10.1 | 2017-06-24 | CRAN (R 3.4.1) |
27 | ggplot2 | 2.2.1 | 2016-12-30 | CRAN (R 3.4.1) |
28 | ggpubr | 0.1.6 | 2017-11-14 | CRAN (R 3.4.1) |
29 | ggrastr | 0.1.5 | 2017-12-28 | Github (VPetukhov/ggrastr@cc56b45) |
30 | ggrepel | 0.7.0 | 2017-09-29 | CRAN (R 3.4.1) |
31 | git2r | 0.21.0 | 2018-01-04 | cran (@0.21.0) |
32 | glue | 1.2.0 | 2017-10-29 | CRAN (R 3.4.1) |
33 | GO.db | 3.2.2 | 2017-11-12 | Bioconductor |
37 | gtable | 0.2.0 | 2016-02-26 | CRAN (R 3.4.1) |
38 | highr | 0.6 | 2016-05-09 | CRAN (R 3.4.1) |
39 | htmltools | 0.3.6 | 2017-04-28 | CRAN (R 3.4.1) |
40 | igraph | 1.2.1 | 2018-03-10 | cran (@1.2.1) |
41 | IRanges | 2.4.8 | 2016-09-15 | Bioconductor |
42 | irlba | 2.3.2 | 2018-01-11 | cran (@2.3.2) |
43 | KernSmooth | 2.23-15 | 2015-06-29 | CRAN (R 3.4.0) |
44 | knitr | 1.20 | 2018-02-20 | cran (@1.20) |
45 | ks | 1.11.0 | 2018-01-16 | local (VPetukhov/ks@NA) |
46 | labeling | 0.3 | 2014-08-23 | CRAN (R 3.4.1) |
47 | lattice | 0.20-35 | 2017-03-25 | CRAN (R 3.4.1) |
48 | lazyeval | 0.2.1 | 2017-10-29 | CRAN (R 3.4.1) |
49 | magrittr | 1.5 | 2014-11-22 | CRAN (R 3.4.1) |
50 | MASS | 7.3-47 | 2017-04-21 | CRAN (R 3.4.0) |
51 | Matrix | 1.2-12 | 2017-11-16 | CRAN (R 3.4.1) |
52 | mclust | 5.4 | 2017-11-22 | CRAN (R 3.4.1) |
53 | memoise | 1.1.0 | 2017-04-21 | CRAN (R 3.4.1) |
55 | mgcv | 1.8-22 | 2017-09-19 | CRAN (R 3.4.1) |
56 | munsell | 0.4.3 | 2016-02-13 | CRAN (R 3.4.1) |
57 | mvtnorm | 1.0-7 | 2018-01-26 | cran (@1.0-7) |
58 | nlme | 3.1-131 | 2017-02-06 | CRAN (R 3.4.0) |
59 | pagoda2 | 0.0.0.9002 | 2018-04-08 | local (hms-dbmi/pagoda2@NA) |
61 | pcaMethods | 1.60.0 | 2017-11-12 | Bioconductor |
62 | pcaPP | 1.9-73 | 2018-01-14 | cran (@1.9-73) |
63 | pheatmap | 1.0.8 | 2015-12-11 | CRAN (R 3.4.1) |
64 | pkgconfig | 2.0.1 | 2017-03-21 | CRAN (R 3.4.1) |
65 | plyr | 1.8.4 | 2016-06-08 | CRAN (R 3.4.1) |
66 | R6 | 2.2.2 | 2017-06-17 | CRAN (R 3.4.1) |
67 | RColorBrewer | 1.1-2 | 2014-12-07 | CRAN (R 3.4.1) |
68 | Rcpp | 0.12.17 | 2018-05-18 | cran (@0.12.17) |
69 | rjson | 0.2.19 | 2018-05-18 | cran (@0.2.19) |
70 | rlang | 0.1.4 | 2017-11-05 | CRAN (R 3.4.1) |
71 | rmarkdown | 1.9 | 2018-03-01 | CRAN (R 3.4.1) |
72 | Rook | 1.1-1 | 2014-10-20 | CRAN (R 3.4.1) |
73 | rprojroot | 1.3-2 | 2018-01-03 | cran (@1.3-2) |
74 | RSQLite | 2.0 | 2017-06-19 | CRAN (R 3.4.1) |
75 | Rtsne | 0.14 | 2017-11-12 | Github (jkrijthe/Rtsne@1d0f926) |
76 | S4Vectors | 0.8.11 | 2016-01-30 | Bioconductor |
77 | scales | 0.5.0 | 2017-08-24 | CRAN (R 3.4.1) |
78 | sessioninfo | 1.0.0 | 2017-06-21 | CRAN (R 3.4.1) |
81 | stringi | 1.1.7 | 2018-03-12 | cran (@1.1.7) |
82 | stringr | 1.3.0 | 2018-02-19 | cran (@1.3.0) |
83 | tibble | 1.3.4 | 2017-08-22 | CRAN (R 3.4.1) |
85 | triebeard | 0.3.0 | 2016-08-04 | cran (@0.3.0) |
86 | urltools | 1.7.0 | 2018-01-20 | cran (@1.7.0) |
88 | withr | 2.1.2 | 2018-03-15 | cran (@2.1.2) |
89 | yaml | 2.1.18 | 2018-03-08 | cran (@2.1.18) |
This R Markdown site was created with workflowr