Source file: notebooks/umi_correction/umi_bmmc1.Rmd

Last updated: 2018-02-06

Code version: 58021ae

Load data

Link to the original dataset.

library(ggplot2)
library(ggrastr)
library(ggpubr)
library(dplyr)
library(parallel)
library(reshape2)
library(dropestr)
library(dropEstAnalysis)
library(Matrix)

theme_set(theme_base)

kPlotsDir <- '../../output/figures/'
kDatasetName <- 'frozen_bmmc_healthy_donor1'
kDatasetPath <- '../../data/dropest/10x/frozen_bmmc_healthy_donor1/'
kDataPath <- paste0(kDatasetPath, 'est_01_20_umi_quality/')
kData10xPath <- paste0(kDatasetPath, 'est_11_10_umi_quality/')
holder <- readRDS(paste0(kDataPath, 'bmmc.rds'))
if (length(holder$reads_per_umi_per_cell$reads_per_umi[[1]][[1]]) != 2)
  stop("Quality must be provided")

umi_distribution <- GetUmisDistribution(holder$reads_per_umi_per_cell$reads_per_umi)
umi_probs <- umi_distribution / sum(umi_distribution)
collisions_info <- FillCollisionsAdjustmentInfo(umi_probs, max(holder$cm))

UMI correction

# corrected_reads <- list()
# corrected_reads$Bayesian <- holder$reads_per_umi_per_cell %>%
#   CorrectUmiSequenceErrors(method='Bayesian', return='reads',
#                            collisions.info=collisions_info, umi.probabilities=umi_probs,
#                            verbosity.level=2, mc.cores=30)
# 
# corrected_reads$cluster <- holder$reads_per_umi_per_cell %>%
#   CorrectUmiSequenceErrors(method='Classic', return='reads',
#                            collisions.info=collisions_info, umi.probabilities=umi_probs,
#                            verbosity.level=2, mc.cores=30)
# 
# corrected_reads$`cluster-neq` <- holder$reads_per_umi_per_cell %>%
#   CorrectUmiSequenceErrors(method='Classic', return='reads', mult=1+1e-4,
#                            collisions.info=collisions_info, umi.probabilities=umi_probs,
#                            verbosity.level=2, mc.cores=30)
# 
# corrected_reads$directional <- holder$reads_per_umi_per_cell %>%
#   CorrectUmiSequenceErrors(method='Classic', return='reads', mult=2,
#                            collisions.info=collisions_info, umi.probabilities=umi_probs,
#                            verbosity.level=2, mc.cores=30)
# 
# corrected_reads$`no correction` <- holder$reads_per_umi_per_cell$reads_per_umi

# saveRDS(corrected_reads, paste0(kDataPath, 'corrected_rpus.rds'))
corrected_reads <- readRDS(paste0(kDataPath, 'corrected_rpus.rds'))
corrected_cms <- lapply(corrected_reads, BuildCountMatrixFromReads, 
                        reads.per.umi.per.cb.info=holder$reads_per_umi_per_cell, 
                        collisions.info=collisions_info)
corrected_cms <- lapply(corrected_cms, function(cm) cm[grep("^[^;]+$", rownames(cm)), ])
names(corrected_cms) <- c('Bayesian', 'cluster', 'cluster-neq', 'directional', 
                          'no correction')

correction_colors <- c(`CellRanger`="#3b5ddb", Bayesian="#017A5A", cluster="#9B3BB8", 
                       `cluster-neq`="#E69F00", directional="#BD5500", 
                       `no correction`='#757575')

Magnitude of correction

Raw expression

PlotCorrectionSize(corrected_cms, correction_colors) + 
  labs(x = 'Raw expression', y = 'Correction magnitude')

Normalized expression

norm_cms <- lapply(corrected_cms, function(cm) 1000 * t(t(cm) / Matrix::colSums(cm)))
size_supp_fig <- PlotCorrectionSize(norm_cms, correction_colors,
                                    xlim=c(10, 1010), ylim=c(1e-2, 1000), 
                                    dpi=150, width=4, height=2.5) + 
  labs(x = 'Normalized expression', y = 'Correction magnitude')

ggsave(paste0(kPlotsDir, 'supp_bmmc_correction_size.pdf'), size_supp_fig, width=8, height=5)
size_supp_fig

Subset for main figure

gg_correction_size <- norm_cms[c('Bayesian', 'cluster', 'no correction')] %>%
  PlotCorrectionSize(correction_colors, xlim=c(10, 1010), ylim=c(1e-2, 1000), 
                     dpi=150, width=4, height=4, facet=F,
                     mapping=aes(x=`no correction`, y=`no correction`-value, 
                                 color=Correction, alpha=Correction)) + 
  labs(x = 'Normalized expression', y = 'Correction magnitude') +
  scale_alpha_manual(values=c(Bayesian=0.05, cluster=0.02))
gg_correction_size

Edit distances

Comparison of edit distances with the expected distribution, similar to UMI Tools paper.

holder_10x <- readRDS(paste0(kData10xPath, 'bmmc.rds'))
corrected_reads$CellRanger <- holder_10x$reads_per_umi_per_cell$reads_per_umi

Theoretical distribution. Here we use distribution of raw data, but changing it to one of the corrected distributions doesn’t affect the results:

# ed_probs <- sapply(1:500, function(i) SampleNoReps(1000, names(umi_probs), umi_probs) %>% 
#                      PairwiseHamming()) %>% ValueCounts(return_probs=T)
# ed_probs <- ed_probs[paste(1:5)]

ed_probs <- corrected_reads$`no correction` %>% sapply(length) %>%
  mclapply(SampleNoReps, names(umi_probs), umi_probs, mc.cores=20) %>% 
  EditDistanceDistribution(mc.cores=20)

Observed distribution:

umis_per_gene <- mclapply(corrected_reads, lapply, names, mc.cores=6)

obs_ed_probs <- mclapply(umis_per_gene, function(upg) 
  EditDistanceDistribution(upg, mc.cores=8), mc.cores=6) %>% 
  as_tibble()

Figure build:

levels_order <- c('Bayesian', 'CellRanger', 'cluster', 'cluster-neq', 'directional', 
                  'no correction')

plot_df <- (abs(obs_ed_probs - ed_probs) / ed_probs) %>% mutate(EditDistance=1:5) %>% 
  melt(variable.name = 'Correction', value.name = 'Error', id.vars = 'EditDistance')
plot_df$Correction <- factor(as.character(plot_df$Correction), levels=levels_order, ordered=T)

text_df <- data.frame(Prob=ed_probs, EditDistance=1:5, x=1:5 - 0.03) %>%
  mutate(y = plot_df %>% group_by(EditDistance) %>% summarise(Error=max(Error)) %>% 
           .$Error * 100 + 3.5)

breaks <- seq(0, 100, by=25)
gg_eds <- ggplot(plot_df) + 
  geom_bar(aes(x = EditDistance, y = 100 * Error, fill = Correction), color = 'black', 
           position = 'dodge', stat = 'identity') + 
  labs(x = 'Edit distance', y = 'Relative probability error, %') +
  geom_text(aes(x=x, y=y, label=format(Prob, digits=2)), text_df) +
  scale_y_continuous(expand=c(0.0, 0), limits=c(0, 107), minor_breaks=breaks - 1e-3, 
                     breaks=breaks) +
  scale_x_continuous(minor_breaks=NULL) +
  scale_fill_manual(values=correction_colors) +
  theme_pdf(legend.pos=c(1, 1)) +
  theme(panel.grid.major=element_blank())
gg_eds

Main figure, right part

gg_fig <- cowplot::plot_grid(gg_eds, gg_correction_size, nrow=2, 
                             align='v', labels=c('D', 'E'))

saveRDS(list(gg_fig=gg_fig, gg_eds=gg_eds, gg_correction_size=gg_correction_size,
             correction_colors=correction_colors, levels_order=levels_order), 
        '../../data/plot_data/bmmc_umi_fig_part2.rds')
gg_fig

Session information

value
version R version 3.4.1 (2017-06-30)
os Ubuntu 14.04.5 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
tz America/New_York
date 2018-02-06
package loadedversion date source
1 assertthat 0.2.0 2017-04-11 CRAN (R 3.4.1)
2 backports 1.1.2 2017-12-13 CRAN (R 3.4.1)
4 bindr 0.1 2016-11-13 CRAN (R 3.4.1)
5 bindrcpp 0.2 2017-06-17 CRAN (R 3.4.1)
6 Cairo 1.5-9 2015-09-26 CRAN (R 3.4.1)
7 clisymbols 1.2.0 2017-05-21 CRAN (R 3.4.1)
8 colorspace 1.3-2 2016-12-14 CRAN (R 3.4.1)
10 cowplot 0.9.2 2017-12-17 CRAN (R 3.4.1)
12 digest 0.6.14 2018-01-14 cran (@0.6.14)
13 dplyr 0.7.4 2017-09-28 CRAN (R 3.4.1)
14 dropEstAnalysis 0.6.0 2018-02-06 local (VPetukhov/dropEstAnalysis@NA)
15 dropestr 0.7.5 2018-02-05 local (@0.7.5)
16 evaluate 0.10.1 2017-06-24 CRAN (R 3.4.1)
17 ggplot2 2.2.1 2016-12-30 CRAN (R 3.4.1)
18 ggpubr 0.1.6 2017-11-14 CRAN (R 3.4.1)
19 ggrastr 0.1.5 2017-12-28 Github (VPetukhov/ggrastr@cc56b45)
20 git2r 0.21.0 2018-01-04 cran (@0.21.0)
21 glue 1.2.0 2017-10-29 CRAN (R 3.4.1)
25 gtable 0.2.0 2016-02-26 CRAN (R 3.4.1)
26 highr 0.6 2016-05-09 CRAN (R 3.4.1)
27 htmltools 0.3.6 2017-04-28 CRAN (R 3.4.1)
28 knitr 1.18 2017-12-27 cran (@1.18)
29 labeling 0.3 2014-08-23 CRAN (R 3.4.1)
30 lattice 0.20-35 2017-03-25 CRAN (R 3.4.1)
31 lazyeval 0.2.1 2017-10-29 CRAN (R 3.4.1)
32 magrittr 1.5 2014-11-22 CRAN (R 3.4.1)
33 Matrix 1.2-12 2017-11-16 CRAN (R 3.4.1)
35 munsell 0.4.3 2016-02-13 CRAN (R 3.4.1)
37 pkgconfig 2.0.1 2017-03-21 CRAN (R 3.4.1)
38 plyr 1.8.4 2016-06-08 CRAN (R 3.4.1)
39 R6 2.2.2 2017-06-17 CRAN (R 3.4.1)
40 Rcpp 0.12.15 2018-01-20 cran (@0.12.15)
41 reshape2 1.4.3 2017-12-11 CRAN (R 3.4.1)
42 rlang 0.1.4 2017-11-05 CRAN (R 3.4.1)
43 rmarkdown 1.8 2017-11-17 CRAN (R 3.4.1)
44 rprojroot 1.3-2 2018-01-03 cran (@1.3-2)
45 scales 0.5.0 2017-08-24 CRAN (R 3.4.1)
46 sessioninfo 1.0.0 2017-06-21 CRAN (R 3.4.1)
48 stringi 1.1.6 2017-11-17 CRAN (R 3.4.1)
49 stringr 1.2.0 2017-02-18 CRAN (R 3.4.1)
50 tibble 1.3.4 2017-08-22 CRAN (R 3.4.1)
53 withr 2.1.1 2017-12-19 cran (@2.1.1)
54 yaml 2.1.16 2017-12-12 CRAN (R 3.4.1)

This R Markdown site was created with workflowr