Source file: notebooks/low_quality_cells/label_noise_robustness.Rmd

Last updated: 2018-02-02

Code version: b8b2fd2

Initialization

library(ggplot2)
library(ggrastr)
library(parallel)
library(dplyr)
library(dropestr)
library(dropEstAnalysis)
library(reshape2)
library(randomForest)
library(ggpubr)
library(cowplot)

theme_set(theme_base)

# Return the second-class ("real cell") probabilities, named by cell barcode
RgpPredict <- function(clf, x) {
  predictions <- predictORMCGP(clf, x)$prob[,2]
  names(predictions) <- rownames(x)
  return(predictions)
}

# Train / predict wrappers for the three compared classifiers: kernel density
# estimation (KDE), random forest (RF) and robust Gaussian processes (RGP)
classifier_funcs <- list(
  KDE=list(train=TrainKDE, predict=function(clf, y) PredictKDE(clf, y)[, 2]), 
  RF=list(train=function(x, y) randomForest(x=x, y=as.factor(y), ntree=1000, mtry=2), 
          predict=function(clf, x) predict(clf, x, type='prob')[,2]),
  RGP=list(train=function(x, y) epORGPC(x, as.factor(y), kernel='gaussian'), 
           predict=RgpPredict))
kPlotDir <- '../../output/figures/'
# Load dropEst output for a single run; the per-UMI read information is not
# used in this analysis, so drop it to save memory
holder <- readRDS('../../data/dropest/allon_new/SRR3879617/est_11_22/cell.counts.rds')
holder$reads_per_umi_per_cell <- NULL

Number of analyzed cells (the top 1300 barcodes by UMI count):

analised_cbs <- names(sort(holder$aligned_umis_per_cell, decreasing=T)[1:1300])
umi_counts <- holder$aligned_umis_per_cell[analised_cbs]

features <- PrepareLqCellsDataPipeline(holder, mit.chromosome.name="chrM")[analised_cbs,]
classifier_data <- GetOptimalPcs(features)$pca.data
cell_number <- EstimateCellsNumber(umi_counts)

real_cbs <- analised_cbs[1:cell_number$min]
background_cbs <- analised_cbs[cell_number$max:length(analised_cbs)]

scores_base <- mclapply(classifier_funcs, ScoringFunction, classifier_data, 
                        real_cbs, background_cbs, mc.cores=3)
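
The borders above split the rank-ordered barcodes into three groups: confident real cells (ranks up to cell_number$min), confident background (ranks past cell_number$max), and an intermediate "Unknown" zone in between, which is used in the robustness checks below. An illustrative partition (not part of the pipeline):

groups <- rep('Unknown', length(analised_cbs))                # intermediate "Unknown" zone
groups[1:cell_number$min] <- 'Real'                           # confident real cells
groups[cell_number$max:length(analised_cbs)] <- 'Background'  # confident background
table(groups)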

PlotCellsNumberLine(umi_counts, estimate.cells.number=T, breaks=50) +
  theme_pdf()

Mitochondrial fraction:

smoothScatter(features$MitochondrionFraction, xlab='Cell rank', 
              ylab='Mitochondrion fraction', cex.lab=1.2)

CV on labeled data

Simple cross-validation on the labeled data. While the labels contain errors, it still gives us an estimate of the algorithms’ quality.

kfold_labeled_res <- lapply(classifier_funcs, function(clf)
  KFoldCV(classifier_data[c(real_cbs, background_cbs),], 
          c(rep(1, length(real_cbs)), rep(0, length(background_cbs))), 
          clf$train, clf$predict, k=5, stratify=F, 
          measure = c('sensitivity', 'specificity')))

CvResultsTable(kfold_labeled_res)
| Classifier | Sensitivity, % | Specificity, % |
|------------|----------------|----------------|
| KDE        | 90 (±4)        | 90.7 (±1.2)    |
| RF         | 89.1 (±3.8)    | 91.9 (±1)      |
| RGP        | 87.7 (±2.3)    | 95.1 (±1.9)    |
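
For reference, the reported metrics reduce to the following (a minimal sketch with a hypothetical helper; it assumes the scores are probabilities of the "real cell" class, binarized at an assumed 0.5 threshold):

# Hypothetical helper, not part of dropEstAnalysis
ClassificationMetrics <- function(scores, labels, threshold=0.5) {
  predicted <- as.integer(scores > threshold)
  c(sensitivity = mean(predicted[labels == 1] == 1),  # TP / (TP + FN)
    specificity = mean(predicted[labels == 0] == 0))  # TN / (TN + FP)
}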

Stability of labels on CV

Let’s check robustness to data subsetting: we remove 20% of the training data and evaluate the answers on the same held-out cells of “Unknown” quality. Here, the “correct” answer is the one obtained after training on the whole dataset.

train_cbs <- c(real_cbs, background_cbs)
intermediate_cbs <- setdiff(analised_cbs, train_cbs)

intermediate_res <- mapply(function(scores, clf) 
  KFoldCV(classifier_data[train_cbs,], scores[train_cbs], clf$train, clf$predict, 
          k = 5, stratify = F, test.force = list(x=classifier_data[intermediate_cbs,], 
                                                 y=scores[intermediate_cbs]), 
          measure = c('sensitivity', 'specificity')), 
  scores_base, classifier_funcs, SIMPLIFY=F) %>%
  setNames(names(classifier_funcs))

CvResultsTable(intermediate_res)
| Classifier | Sensitivity, % | Specificity, % |
|------------|----------------|----------------|
| KDE        | 90.2 (±4.1)    | 96.3 (±0.6)    |
| RF         | 89.8 (±6.1)    | 96.5 (±1.9)    |
| RGP        | 83.5 (±2.6)    | 99.3 (±0.6)    |
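
For intuition, one round of this stability check amounts to the following (a sketch for the RF classifier; KFoldCV with test.force automates it over folds, and the 0.5 cutoff used to binarize the scores is an assumption):

# One manual stability round (illustrative only)
correct_labels <- as.integer(scores_base$RF[intermediate_cbs] > 0.5)
subsample <- sample(train_cbs, round(0.8 * length(train_cbs)))
clf <- classifier_funcs$RF$train(classifier_data[subsample,],
                                 as.integer(scores_base$RF[subsample] > 0.5))
preds <- classifier_funcs$RF$predict(clf, classifier_data[intermediate_cbs,])
mean((preds > 0.5) == correct_labels)  # agreement with the full-data answers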

Dependency between scores and borders

Again, we check stability on the “Unknown” cells, but now we widen the borders of the real / background sets. To estimate confidence intervals, we randomly remove 10% of the data (as in 10-fold CV).
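
One plausible reading of the offset (the actual logic lives in ScoreDependsOnBorders; this sketch and its rounding are assumptions): each training border is shifted into the “Unknown” zone by the given fraction of the zone’s size, widening both training sets.

# Purely illustrative widening of the training sets for offset = 0.25
offset <- 0.25
zone_size <- cell_number$max - cell_number$min
real_wide <- analised_cbs[1:(cell_number$min + round(offset * zone_size))]
background_wide <- analised_cbs[(cell_number$max - round(offset * zone_size)):length(analised_cbs)]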

offset_vals <- seq(0.0, 0.95, 0.05)
offset_accs <- mclapply(names(classifier_funcs), function(n) 
  mclapply(offset_vals, function(offset) 
    ScoreDependsOnBorders(classifier_data, scores_base[[n]], classifier_funcs[[n]], 
                          real_cbs, background_cbs, offset, offset), mc.cores=20), mc.cores=2)
names(offset_accs) <- names(classifier_funcs)
plot_df <- lapply(offset_accs, function(df) lapply(1:length(offset_vals), function(i) 
  cbind(df[[i]], Measure=rownames(df[[i]]), Offset=offset_vals[i])) %>% bind_rows()) %>% 
  bind_rows(.id="Classifier") %>% mutate(mean=1-mean)
plot_df$Measure <- c('False negative rate', 'False positive rate')[as.integer(plot_df$Measure)]

gg_borders <- ggplot(plot_df, aes(x=Offset, ymin=mean-sd, ymax=mean+sd, 
                                  y=mean, color=Classifier, shape=Measure)) + 
  geom_pointrange(fatten=2.5, alpha=0.8, size=0.8) + 
  geom_line(aes(linetype=Measure), size=0.8, alpha=0.7) + 
  xlim(0, 1) + 
  scale_color_manual(values = c("#4E58E0", "#F08000", "#349147")) + 
  labs(x='Border offset', y='Value') +
  theme_pdf()

gg_borders

Robustness to errors in labels

Let’s assume that the classifier answers are the real class labels. We introduce some noise into them and retrain the classifiers on the noisy labels. Here, 75% of the dataset is used to train the classifiers and 25% to test them.
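
The flipping itself is straightforward (a minimal sketch, assuming 0/1 labels; the real noise injection happens inside ScoreCellsWithWrongLabels, and FlipLabels is a hypothetical helper):

FlipLabels <- function(labels, frac.real, frac.background) {
  for (cls in c(1, 0)) {
    frac <- if (cls == 1) frac.real else frac.background
    ids <- which(labels == cls)
    # flip a fixed fraction of the class, chosen at random
    flip <- ids[sample.int(length(ids), round(frac * length(ids)))]
    labels[flip] <- 1 - labels[flip]
  }
  return(labels)
}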

Symmetric errors

wrong_frac_vals <- seq(0.0, 1.0, 0.1)

var_types <- c('fpr', 'fnr')
wrong_labels_vars <- list(wrong=paste0('wrong.', var_types), all=paste0('all.', var_types), test=paste0('test.', var_types))

scores_with_err <- mcmapply(function(clf, scores) 
  mclapply(wrong_frac_vals, function(frac) mclapply(1:10, function(y) 
      ScoreCellsWithWrongLabels(classifier_data, scores, clf, frac, frac, 0.25), mc.cores=2), mc.cores=10),
  classifier_funcs, scores_base, SIMPLIFY=F, mc.cores=2)

names(scores_with_err) <- names(classifier_funcs)

Test data

Results on the test subset (25% of data):

measure_names <- c(fpr='False positive rate', fnr='False negative rate')
gg_offset_test <- scores_with_err %>%
  PlotTestedClassifierErrors(wrong_frac_vals, wrong_labels_vars, var_types, 
                             filt.subset='test', measure.names=measure_names) +
  theme_pdf()

gg_offset_test

Changed labels

Results on cells with changed labels:

gg_offset_wrong <- scores_with_err %>%
  PlotTestedClassifierErrors(wrong_frac_vals, wrong_labels_vars, var_types, 
                             filt.subset='wrong', measure.names=measure_names) +
  theme_pdf()

gg_offset_wrong

Nonsymmetric errors

Errors in real CBs only:

wrong_frac_vals_one_side <- wrong_frac_vals[1:(length(wrong_frac_vals) - 1)]

scores_with_err_real <- mcmapply(function(clf, scores) 
  mclapply(wrong_frac_vals_one_side, function(frac) mclapply(1:10, function(y) 
      ScoreCellsWithWrongLabels(classifier_data, scores, clf, frac, 0.0, 0.25), mc.cores=2), 
      mc.cores=10),
  classifier_funcs, scores_base, SIMPLIFY=F, mc.cores=2)

PlotTestedClassifierErrors(scores_with_err_real, wrong_frac_vals_one_side, 
                           wrong_labels_vars, var_types) +
  theme_pdf(legend.pos=c(0, 1))

Errors in background CBs only:

scores_with_err_background <- mcmapply(function(clf, scores) 
  mclapply(wrong_frac_vals_one_side, function(frac) mclapply(1:10, function(y) 
      ScoreCellsWithWrongLabels(classifier_data, scores, clf, 0.0, frac, 0.25), mc.cores=2), 
      mc.cores=10),
  classifier_funcs, scores_base, SIMPLIFY=F, mc.cores=2)

PlotTestedClassifierErrors(scores_with_err_background, wrong_frac_vals_one_side, 
                           wrong_labels_vars, var_types) +
  theme_pdf(legend.pos=c(0, 1))

Complete figure

plotlist <- list(gg_offset_test, gg_offset_wrong, gg_borders) %>% 
  lapply(function(gg) gg + rremove("legend")) %>% 
  lapply(`+`, theme(plot.margin=ggplot2::margin()))

legend <- get_legend(
  gg_borders + 
    scale_color_manual(values = c("#4E58E0", "#F08000", "#349147"), 
                       labels=c("Kernel density estimation", "Random forest", 
                                "Robust Gaussian processes")) +
    guides(color=guide_legend(override.aes=list(size=0.25)), 
           shape=guide_legend(override.aes=list(size=0.5))) +
    theme(legend.box.background=element_blank(), 
          legend.text=element_text(size=11), 
          legend.title=element_text(size=13)))

plotlist[[1]] <- plotlist[[1]] + rremove("x.text") + rremove("x.ticks") + rremove("xlab")
plotlist[[3]] <- plotlist[[3]] + rremove("y.text") + rremove("y.ticks") + rremove("ylab")

gg_figure <- plot_grid(plotlist[[1]], legend, plotlist[[2]], plotlist[[3]], ncol=2, 
                       rel_widths=c(1, 0.9), rel_heights=c(0.9, 1),
                       labels=c('A', '', 'B', 'C'), label_y=0.98, 
                       label_x=c(0.15, 0.0, 0.15, 0.02)) +
  theme(plot.margin=ggplot2::margin(2, 2, 2, 2))

ggsave(paste0(kPlotDir, 'supp_label_noise_robustness.pdf'),
       gg_figure, width=8, height=6)
gg_figure

Session information

| Setting  | Value                        |
|----------|------------------------------|
| version  | R version 3.4.1 (2017-06-30) |
| os       | Ubuntu 14.04.5 LTS           |
| system   | x86_64, linux-gnu            |
| ui       | X11                          |
| language | (EN)                         |
| collate  | en_US.UTF-8                  |
| tz       | America/New_York             |
| date     | 2018-02-02                   |

| Package         | Version | Date       | Source                               |
|-----------------|---------|------------|--------------------------------------|
| assertthat      | 0.2.0   | 2017-04-11 | CRAN (R 3.4.1)                       |
| backports       | 1.1.2   | 2017-12-13 | cran (@1.1.2)                        |
| bindr           | 0.1     | 2016-11-13 | CRAN (R 3.4.1)                       |
| bindrcpp        | 0.2     | 2017-06-17 | CRAN (R 3.4.1)                       |
| clisymbols      | 1.2.0   | 2017-05-21 | cran (@1.2.0)                        |
| colorspace      | 1.2-6   | 2015-03-11 | CRAN (R 3.3.1)                       |
| cowplot         | 0.8.0   | 2017-07-30 | CRAN (R 3.4.1)                       |
| digest          | 0.6.15  | 2018-01-28 | cran (@0.6.15)                       |
| dplyr           | 0.7.4   | 2017-09-28 | cran (@0.7.4)                        |
| dropEstAnalysis | 0.6.0   | 2018-02-02 | local (VPetukhov/dropEstAnalysis@NA) |
| dropestr        | 0.7.5   | 2018-02-01 | local (@0.7.5)                       |
| evaluate        | 0.10.1  | 2017-06-24 | cran (@0.10.1)                       |
| ggplot2         | 2.2.1   | 2016-12-30 | cran (@2.2.1)                        |
| ggpubr          | 0.1.4   | 2017-06-28 | CRAN (R 3.4.1)                       |
| ggrastr         | 0.1.5   | 2018-01-29 | Github (VPetukhov/ggrastr@cc56b45)   |
| git2r           | 0.21.0  | 2018-01-04 | cran (@0.21.0)                       |
| glue            | 1.2.0   | 2017-10-29 | cran (@1.2.0)                        |
| gtable          | 0.2.0   | 2016-02-26 | CRAN (R 3.3.1)                       |
| highr           | 0.6     | 2016-05-09 | CRAN (R 3.3.1)                       |
| htmltools       | 0.3.6   | 2017-04-28 | cran (@0.3.6)                        |
| KernSmooth      | 2.23-15 | 2015-06-29 | CRAN (R 3.4.0)                       |
| knitr           | 1.19    | 2018-01-29 | cran (@1.19)                         |
| ks              | 1.11.0  | 2018-01-16 | cran (@1.11.0)                       |
| labeling        | 0.3     | 2014-08-23 | CRAN (R 3.3.1)                       |
| lattice         | 0.20-35 | 2017-03-25 | CRAN (R 3.3.3)                       |
| lazyeval        | 0.2.0   | 2016-06-12 | cran (@0.2.0)                        |
| magrittr        | 1.5     | 2014-11-22 | CRAN (R 3.3.1)                       |
| Matrix          | 1.2-10  | 2017-04-28 | CRAN (R 3.4.0)                       |
| mclust          | 5.4     | 2017-11-22 | cran (@5.4)                          |
| munsell         | 0.4.3   | 2016-02-13 | CRAN (R 3.3.1)                       |
| mvtnorm         | 1.0-7   | 2018-01-26 | cran (@1.0-7)                        |
| pcaPP           | 1.9-73  | 2018-01-14 | CRAN (R 3.4.1)                       |
| pillar          | 1.1.0   | 2018-01-14 | cran (@1.1.0)                        |
| pkgconfig       | 2.0.1   | 2017-03-21 | cran (@2.0.1)                        |
| plyr            | 1.8.4   | 2016-06-08 | CRAN (R 3.3.1)                       |
| R6              | 2.2.2   | 2017-06-17 | cran (@2.2.2)                        |
| randomForest    | 4.6-12  | 2015-10-07 | CRAN (R 3.4.1)                       |
| Rcpp            | 0.12.15 | 2018-01-20 | cran (@0.12.15)                      |
| reshape2        | 1.4.2   | 2016-10-22 | cran (@1.4.2)                        |
| rlang           | 0.1.6   | 2017-12-21 | cran (@0.1.6)                        |
| rmarkdown       | 1.8     | 2017-11-17 | cran (@1.8)                          |
| rprojroot       | 1.3-2   | 2018-01-03 | cran (@1.3-2)                        |
| scales          | 0.4.1   | 2016-11-09 | cran (@0.4.1)                        |
| sessioninfo     | 1.0.0   | 2017-06-21 | cran (@1.0.0)                        |
| stringi         | 1.1.1   | 2016-05-27 | CRAN (R 3.3.1)                       |
| stringr         | 1.2.0   | 2017-02-18 | CRAN (R 3.4.1)                       |
| tibble          | 1.4.2   | 2018-01-22 | cran (@1.4.2)                        |
| withr           | 2.1.1   | 2017-12-19 | cran (@2.1.1)                        |
| yaml            | 2.1.16  | 2017-12-12 | cran (@2.1.16)                       |

This R Markdown site was created with workflowr