Source file: notebooks/umi_correction/trim_aml_035_post.Rmd

Last updated: 2018-02-06

Code version: 58021ae

library(cowplot)
library(ggplot2)
library(ggsci)
library(ggpubr)
library(ggrastr)
library(dplyr)
library(parallel)
library(reshape2)
library(dropestr)
library(dropEstAnalysis)

theme_set(theme_base)
kPlotsFolder <- '../../output/figures/'
kDataPath <- '../../data/dropest/10x/aml035_post_transplant/'
kTrimmedLengths <- 6:9

Data loading

# holders <- mclapply(paste0(kDataPath, c('est_01_21_umi_quality/aml035ost_transplant.rds', 'est_01_21_10x_umi/aml035ost_transplant.rds')), readRDS, mc.cores=2) %>%
#   setNames(c('raw', '10x'))
# 
# reads_per_umi_per_cell <- holders$raw$reads_per_umi_per_cell
# reads_per_umi_per_cell$reads_per_umi <- FilterNUmis(reads_per_umi_per_cell$reads_per_umi)
# 
# saveRDS(reads_per_umi_per_cell, paste0(kDataPath, 'reads_per_umi_per_cell.rds'))

reads_per_umi_per_cell <- readRDS(paste0(kDataPath, 'reads_per_umi_per_cell.rds'))
# reads_per_umi_per_cell$reads_per_umi <- reads_per_umi_per_cell$reads_per_umi[1:50000]
# raw <- readRDS(paste0(kDataPath, 'raw.rds'))

length(reads_per_umi_per_cell$reads_per_umi)
[1] 722905

Trimming

# # WARNING: it requires a lot of memory and CPU. Because of weird R multiprocessing, for me it took up to 650Gb RAM for 30 cores.
# trimmed <- mclapply(kTrimmedLengths, function(i) 
#   TrimAndCorrect(reads_per_umi_per_cell, umi.trim.length=i, mc.cores.large=7, 
#                  mc.cores.small=7, verbosity.level=1), mc.cores=4)
# 
# raw <- list()
# 
# raw$umi_distribution <- GetUmisDistribution(reads_per_umi_per_cell$reads_per_umi, smooth=10)
# raw$umi_probabilities <- raw$umi_distribution / sum(raw$umi_distribution)
# 
# max_umi_per_gene <- sapply(reads_per_umi_per_cell$reads_per_umi, length) %>% max()
# raw$collisions_info <- FillCollisionsAdjustmentInfo(raw$umi_probabilities, max_umi_per_gene)
# raw$correction_info <- PrepareUmiCorrectionInfo(umi.probabilities=raw$umi_probabilities, 
#                                                 raw$collisions_info[max_umi_per_gene])
# 
# raw$filt_reads_per_umi_simple <- reads_per_umi_per_cell %>% 
#   CorrectUmiSequenceErrors(raw$umi_probabilities, method='Classic', 
#                            correction.info=raw$correction_info, 
#                            collisions.info=raw$collisions_info, 
#                            mc.cores=30, verbosity.level=2, return='reads')
# 
# raw$filt_umis_per_gene_simple <- sapply(raw$filt_reads_per_umi_simple, length)
# 
# raw$umis_per_gene <- sapply(reads_per_umi_per_cell$reads_per_umi, length)
# 
# saveRDS(raw, paste0(kDataPath, 'raw5.rds'))
# saveRDS(trimmed, paste0(kDataPath, 'trimmed5.rds'))
raw <- readRDS(paste0(kDataPath, 'raw5.rds'))
trimmed <- readRDS(paste0(kDataPath, 'trimmed5.rds'))

Plots

Supp. Figure Collisions

umis_per_gene_df <- as.data.frame(lapply(trimmed[1:4], `[[`, 'umis.per.gene'))
umis_per_gene_df$raw <- raw$umis_per_gene
colnames(umis_per_gene_df)[1:4] <- paste0(kTrimmedLengths)

umis_per_gene_adj_df <- trimmed[1:4] %>%
  lapply(function(tr) tr$collisions.info[tr$umis.per.gene]) %>% as.data.frame()
colnames(umis_per_gene_adj_df) <- paste0(kTrimmedLengths)

umis_per_gene_adj_classic_df <- mclapply(kTrimmedLengths, function(i) sapply(
  trimmed[[i - 5]]$umis.per.gene, AdjustGeneExpressionUniform, 4^i), mc.cores=5) %>%
  as.data.frame()
colnames(umis_per_gene_adj_classic_df) <- paste0(kTrimmedLengths)

plot_df <- melt(umis_per_gene_df, id.vars='raw', variable.name='UmiLen', 
                value.name='NoAdjustment')
plot_df$Adjusted <- melt(umis_per_gene_adj_df)$value
plot_df$AdjustedClassic <- melt(umis_per_gene_adj_classic_df)$value

plot_df <- melt(plot_df, id.vars=c('raw', 'UmiLen'), variable.name='Adjustment') %>%
  filter(raw > 5)
plot_dfs <- split(plot_df, plot_df$UmiLen)
scale_y_low <- scale_y_continuous(limits=c(-3000, 0), breaks=seq(0, -2500, by=-2500), expand=c(0, 0))

ggs <- lapply(names(plot_dfs), function(n) 
  ggplot(plot_dfs[[n]], aes(x=raw, y=(value - raw), linetype=Adjustment, col=UmiLen)) + 
    geom_smooth() + 
    scale_color_manual(values=scales::hue_pal()(4)[as.integer(n) - 5]) +
    scale_linetype_manual(labels=c('No adjustment', 'Empirical', 'Uniform'), 
                          values=c('solid', 'dashed', 'dotted')) +
    scale_x_continuous(expand=c(0, 0), limits=c(0, 12300), 
                       breaks=seq(0, 10000, 2500)) + 
    scale_y_continuous(expand=c(0, 0), limits=c(-10000, 0)) +
    theme_pdf())

ggs[3:4] <- lapply(ggs[3:4], `+`, scale_y_low)
ggs[[1]] <- ggs[[1]] + theme_pdf(legend.pos=c(0, 0)) + theme(legend.box='horizontal') +
  scale_color_manual(values=alpha(scales::hue_pal()(4), 0.7), name='Length of\ntrimmed UMI', 
                     labels=names(plot_dfs), drop=F) +
  guides(color=guide_legend(ncol=2, nrow=2, byrow=T))

ggs <- BuildPanel4(ggs, "", "", return.raw=T, legend.plot.id=1)

ga1 <- plot_grid(plotlist=ggs[c(1,3)], nrow=2, rel_heights=c(2.5, 1), align='v')
ga2 <- plot_grid(plotlist=ggs[c(2,4)], nrow=2, rel_heights=c(2.5, 1), align='v')

supp_fig_collisions <- annotate_figure(ggarrange(ga1, ga2, ncol=2), 
                                 bottom=text_grob("#Molecules after trimming", size=14), 
                                 left = text_grob("Error after trimming (#molecules)", 
                                                  rot=90, size=14))

ggsave(paste0(kPlotsFolder, 'supp_umi_trim_collisions.pdf'), supp_fig_collisions,
       height=4, width=8)
supp_fig_collisions

Supp. Figure Corrections

plots <- lapply(kTrimmedLengths, function(i) 
  PlotTrimmedCorrections(trimmed[[i - 5]], raw, i, log=T, rast.width=4,
                         rast.height=4.5, rast.dpi=150, heights.ratio=c(2.5, 3)))
large_plots <- lapply(plots, `[[`, 'large') %>% lapply(`+`, annotation_logticks(sides='b'))
small_plots <- lapply(plots, `[[`, 'small')

title_theme <- theme(plot.title=element_text(size=12))
gp_large <- BuildPanel4(large_plots, xlabel="Corrected #UMIs without trimming", 
                        ylabel="Error on trimmed data, %", legend.plot.id=3, labels=NULL, 
                        plot.theme=title_theme)
gp_small <- BuildPanel4(small_plots, xlabel="Corrected #UMIs without trimming", 
                        ylabel="Error on trimmed data, %", labels=NULL, 
                        plot.theme=title_theme)
supp_fig_errors <- plot_grid(gp_large, gp_small, nrow=2, labels=c('A', 'B'), 
                             rel_heights=c(5, 6))
supp_fig_errors

ggsave(paste0(kPlotsFolder, 'supp_umi_trim_error.pdf'), supp_fig_errors, width=8, height=9)

Main figure

umis_per_gene <- as.data.frame(lapply(trimmed, `[[`, 'umis.per.gene'))
colnames(umis_per_gene) <- paste0(kTrimmedLengths)
errors <- ((umis_per_gene - raw$filt_umis_per_gene_simple) / raw$filt_umis_per_gene_simple) %>%
  cbind(Real=raw$filt_umis_per_gene_simple) %>% 
  melt(id.vars='Real', variable.name='UmiLength', value.name='Error')

errors_sum <- errors %>% group_by(Real, UmiLength) %>% summarise(Error=mean(Error, trim=0.2))

Figure A

kPlotWidth <- 8
kPlotHeight <- 7
gp1 <- ggplot(errors_sum, aes(x=Real, y=100 * Error, color=UmiLength)) +
  geom_point_rast(size=0.3, alpha=0.5, width=kPlotWidth / 2, height=kPlotHeight / 3) +
  geom_smooth() +
  scale_x_log10(expand=c(0.01, 0)) + annotation_logticks(sides='b') + 
  scale_y_continuous(limits=c(-65, 60), expand=c(0, 0)) +
  theme_pdf(legend.pos=c(0, 0)) +
  scale_color_npg() +
  guides(color=guide_legend(title='Length of trimmed UMI', ncol=4))

gp1

Figure B

fig_umi_length <- 6
trimmed_corrected_cells <- lapply(raw$filt_reads_per_umi_simple, TrimUmis, fig_umi_length)
umis_per_gene_plt <- data.frame(none=sapply(trimmed_corrected_cells, length), 
                                real=raw$filt_umis_per_gene_simple)

umis_per_gene_plt$empirical <- trimmed[[fig_umi_length - 5]]$collisions.info[umis_per_gene_plt$none]
umis_per_gene_plt$uniform <- sapply(umis_per_gene_plt$none, AdjustGeneExpressionUniform, 
                                    4^fig_umi_length)
umis_per_gene_plt_df <- melt(as.data.frame(umis_per_gene_plt), id.vars='real', 
                             variable.name='Adjustment', value.name='value')

umis_per_gene_plt_df <- umis_per_gene_plt_df %>% group_by(Adjustment, real) %>% 
  summarize(value=mean(value, 0.2))

gp2 <- umis_per_gene_plt_df %>%
  ggplot(aes(x=real, y=100 * (value - real) / real, col=Adjustment)) +
  geom_point_rast(size=0.3, alpha=0.3, width=kPlotWidth / 2, height=kPlotHeight / 3) +
  geom_smooth() +
  scale_x_log10(expand=c(0.01, 0)) + annotation_logticks(sides='b') +
  scale_y_continuous(expand=c(0, 0), limits=c(-65, 11)) +
  scale_color_nejm() +
  labs(x='Corrected #molecules without trimming', y='Error, %') +
  guides(color = guide_legend(title='Collision adjustment', ncol=3)) +
  theme_pdf(legend.pos=c(0, 0))

gp2

Figure C

fig_umi_length <- 7
corrected_data <- as_tibble(trimmed[[fig_umi_length - 5]]$filt_cells)
colnames(corrected_data) <- c('Bayesian', 'cluster', 'cluster-neq', 'directional')
corrected_data$`no correction` <- trimmed[[fig_umi_length - 5]]$umis.per.gene
corrected_data$real <- raw$filt_umis_per_gene_simple

plot_df <- melt(corrected_data, id.vars='real', variable.name='Correction') %>%
  group_by(Correction, real) %>% summarize(value=median(value))

gp3 <- ggplot(plot_df, aes(x=real, y=100 * (value - real) / real, color=Correction)) +
  geom_point_rast(size=0.3, alpha=0.3, width=kPlotWidth / 2, height=kPlotHeight / 3) +
  geom_smooth() +
  scale_x_log10(expand=c(0.01, 0)) + annotation_logticks(sides='b') +
  scale_y_continuous(limits=c(-101, 51), expand=c(0, 0)) +
  # scale_color_manual(values=correction_colors[2:length(correction_colors)]) +
  guides(color=guide_legend(ncol=3)) + theme_pdf(legend.pos=c(0, 0))

gp3

aml035_plots <- list(gp1=gp1, gp2=gp2, gp3=gp3)
# You need to run "umi_correction/umi_bmmc1.Rmd" first to produce these plots
bmmc_plots_data <- readRDS('../../data/plot_data/bmmc_umi_fig_part2.rds')

Complete figure

colors <- alpha(bmmc_plots_data$correction_colors, 0.7) %>% 
  setNames(names(bmmc_plots_data$correction_colors))
scale_color_short <- scale_color_manual(values=colors)
figure_trim <- plot_grid(
  aml035_plots$gp1 + theme_pdf() + ylab('Error, %') + rremove("xlab") + 
    theme(plot.margin=margin(b=2, unit='pt')),
  aml035_plots$gp2 + theme_pdf() + rremove("xlab") + 
    theme(plot.margin=margin(b=2, unit='pt')),
  aml035_plots$gp3 + theme_pdf() + ylab('Error, %') + scale_color_short +
    guides(color=guide_legend(title='Correction (see Figure D)', 
                              label.theme=element_blank(), ncol=5, reverse=T)) +
    rremove("xlab") + theme(plot.margin=margin()),
  ncol=1, nrow=3, labels = c("A", "B", "C"), align='v')

figure_trim_ann <- (figure_trim + theme(plot.margin=margin(t=10, l=0, unit='pt'))) %>%
  annotate_figure(bottom=text_grob("Gene expression magnitude (#molecules)", size=14, 
                                   hjust=0.43))
fig_full <- plot_grid(figure_trim_ann + theme(plot.margin=margin()), 
                      bmmc_plots_data$gg_fig + theme(plot.margin=margin()), ncol=2)
fig_full_ann <- annotate_figure(fig_full, fig.lab="Figure 2", fig.lab.pos="top.right")
fig_full_ann

ggsave(paste0(kPlotsFolder, 'umi_correction_figure.pdf'), fig_full_ann, width=8, height=7)

Session information

value
version R version 3.4.1 (2017-06-30)
os Ubuntu 14.04.5 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
tz America/New_York
date 2018-02-06
package loadedversion date source
1 assertthat 0.2.0 2017-04-11 CRAN (R 3.4.1)
2 backports 1.1.2 2017-12-13 CRAN (R 3.4.1)
4 bindr 0.1 2016-11-13 CRAN (R 3.4.1)
5 bindrcpp 0.2 2017-06-17 CRAN (R 3.4.1)
6 Cairo 1.5-9 2015-09-26 CRAN (R 3.4.1)
7 clisymbols 1.2.0 2017-05-21 CRAN (R 3.4.1)
8 colorspace 1.3-2 2016-12-14 CRAN (R 3.4.1)
10 cowplot 0.9.2 2017-12-17 CRAN (R 3.4.1)
12 digest 0.6.14 2018-01-14 cran (@0.6.14)
13 dplyr 0.7.4 2017-09-28 CRAN (R 3.4.1)
14 dropEstAnalysis 0.6.0 2018-02-06 local (VPetukhov/dropEstAnalysis@NA)
15 dropestr 0.7.5 2018-02-05 local (@0.7.5)
16 evaluate 0.10.1 2017-06-24 CRAN (R 3.4.1)
17 ggplot2 2.2.1 2016-12-30 CRAN (R 3.4.1)
18 ggpubr 0.1.6 2017-11-14 CRAN (R 3.4.1)
19 ggrastr 0.1.5 2017-12-28 Github (VPetukhov/ggrastr@cc56b45)
20 ggsci 2.8 2017-09-30 CRAN (R 3.4.1)
21 git2r 0.21.0 2018-01-04 cran (@0.21.0)
22 glue 1.2.0 2017-10-29 CRAN (R 3.4.1)
26 gridExtra 2.3 2017-09-09 CRAN (R 3.4.1)
27 gtable 0.2.0 2016-02-26 CRAN (R 3.4.1)
28 highr 0.6 2016-05-09 CRAN (R 3.4.1)
29 htmltools 0.3.6 2017-04-28 CRAN (R 3.4.1)
30 knitr 1.18 2017-12-27 cran (@1.18)
31 labeling 0.3 2014-08-23 CRAN (R 3.4.1)
32 lattice 0.20-35 2017-03-25 CRAN (R 3.4.1)
33 lazyeval 0.2.1 2017-10-29 CRAN (R 3.4.1)
34 magrittr 1.5 2014-11-22 CRAN (R 3.4.1)
35 Matrix 1.2-12 2017-11-16 CRAN (R 3.4.1)
37 mgcv 1.8-22 2017-09-19 CRAN (R 3.4.1)
38 munsell 0.4.3 2016-02-13 CRAN (R 3.4.1)
39 nlme 3.1-131 2017-02-06 CRAN (R 3.4.0)
41 pkgconfig 2.0.1 2017-03-21 CRAN (R 3.4.1)
42 plyr 1.8.4 2016-06-08 CRAN (R 3.4.1)
43 purrr 0.2.4 2017-10-18 CRAN (R 3.4.1)
44 R6 2.2.2 2017-06-17 CRAN (R 3.4.1)
45 Rcpp 0.12.15 2018-01-20 cran (@0.12.15)
46 reshape2 1.4.3 2017-12-11 CRAN (R 3.4.1)
47 rlang 0.1.4 2017-11-05 CRAN (R 3.4.1)
48 rmarkdown 1.8 2017-11-17 CRAN (R 3.4.1)
49 rprojroot 1.3-2 2018-01-03 cran (@1.3-2)
50 scales 0.5.0 2017-08-24 CRAN (R 3.4.1)
51 sessioninfo 1.0.0 2017-06-21 CRAN (R 3.4.1)
53 stringi 1.1.6 2017-11-17 CRAN (R 3.4.1)
54 stringr 1.2.0 2017-02-18 CRAN (R 3.4.1)
55 tibble 1.3.4 2017-08-22 CRAN (R 3.4.1)
58 withr 2.1.1 2017-12-19 cran (@2.1.1)
59 yaml 2.1.16 2017-12-12 CRAN (R 3.4.1)

This R Markdown site was created with workflowr