1 Rmarkdown setup

The most up to date data-analysis is available on GitHub.

First, basic parameters are set up in this RMarkdown, such as loading dependencies, setting paths and setting up a uniform plot structure.

require(tidyverse)
require(reshape2)
require(broom)
require(RColorBrewer)
require(scales)
require(ggrepel)
require(gridExtra)
require(ggExtra)
require(ggplot2)
require(RCurl)
require(plotly)
require(ggpubr)
require(readxl)
require(httr)
require(psych)
require(ggbeeswarm)
require(irr)
require(asht)
require(DT)

source("HelperFunctions.R")

options(scipen = 10000)
# Themes
theme_point <-
  theme_classic2() + theme(strip.background = element_blank())

theme_bar <-
  theme_classic() + theme(
    axis.text.x = element_text(
      angle = 90,
      vjust = 0.5,
      hjust = 1
    ),
    strip.background = element_blank(),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank()
  )

theme_boxplot <-
  theme_classic() + theme(
    axis.text.x = element_text(
      angle = 90,
      vjust = 0.5,
      hjust = 1
    ),
    strip.background = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    axis.line.x = element_blank(),
    legend.position = "none"
  )

color.A = colorRampPalette(brewer.pal(4, "BrBG"))(4)[4]
color.B = colorRampPalette(brewer.pal(4, "BrBG"))(4)[2]
color.C = colorRampPalette(brewer.pal(4, "BrBG"))(4)[1]

url <-
  "https://www.dropbox.com/s/yovbmtua0h4hx47/supplementary_table_2_analysis.xlsx?dl=1"
GET(url, write_disk(tf <- tempfile(fileext = ".xlsx")))

analysis <-
  read_excel(tf,
             sheet = "samples",
             skip = 1,
             col_names = TRUE)
scRRBS_annotation <-
  read_excel(tf,
             sheet = "public_scRRBS_QC",
             skip = 1,
             col_names = TRUE)

url <-
  "https://www.biorxiv.org/content/biorxiv/early/2019/10/07/795047/DC2/embed/media-2.xlsx?download=true"
GET(url, write_disk(tf <- tempfile(fileext = ".xlsx")))

clinical_annotation <-
  read_excel(tf,
             sheet = "samples_final",
             skip = 1,
             col_names = TRUE)
clinical_annotation$Months_at_diagnosis <-
  as.double(clinical_annotation$Months_at_diagnosis)

url <-
  "https://www.dropbox.com/s/yovbmtua0h4hx47/supplementary_table_2_analysis.xlsx?dl=1"
GET(url, write_disk(tf <- tempfile(fileext = ".xlsx")))

HMW_ratio <-
  read_excel(tf,
             sheet = "cfDNA-HMW ratio",
             skip = 0,
             col_names = TRUE)
HMW_ratio <- HMW_ratio[!is.na(HMW_ratio$SampleID),]

fulldeconv <-
  read_excel(tf,
             sheet = "deconvolution_output",
             skip = 0,
             col_names = TRUE) %>% melt()
colnames(fulldeconv) <- c("tumor", "SampleID", "fraction")

resids_result <-
  as.data.frame(read_excel(
    tf,
    sheet = "residuals",
    skip = 0,
    col_names = TRUE
  ))

reproducibility <-
  as.data.frame(
    read_csv(
      "https://www.dropbox.com/s/0ernrimdmtquiu9/reproducibility_matrix.csv?dl=1",
      na = c("NA")
    )
  )

2 Sample annotation

sample_annotation <-
  left_join(analysis, clinical_annotation, by = "SampleID")

sample_annotation <-
  left_join(sample_annotation, HMW_ratio, by = "SampleID")

sample_annotation$IchorCNA_TFx <-
  as.numeric(sample_annotation$IchorCNA_TFx)

sample_annotation <-
  sample_annotation %>% mutate(ClassificationDetail = ifelse((sample_annotation$CorrectClassification == FALSE) &
                                                               (sample_annotation$ClassificationResult == "Normal"),
                                                             "Inconclusive",
                                                             ifelse((sample_annotation$CorrectClassification == FALSE) &
                                                                      (sample_annotation$ClassificationResult != "Normal"),
                                                                    "Incorrect tumor",
                                                                    "Correct"
                                                             )
  ))

sample_annotation <- sample_annotation %>%
  mutate(
    disease_extent = ifelse(
      sample_annotation$Metastatic_status == "M+",
      "Metastatic",
      "Localized"
    )
  )

sample_annotation <- sample_annotation %>%
  mutate(cfRRBS_sWGS_CNA = paste0(cfRRBS_CNA, "-", sWGS_CNA))

sample_annotation$cfDNA_HMW_ratio_binned = cut(
  sample_annotation$cfDNA_HMW_ratio,
  c(-Inf, 1, 5, Inf),
  labels = c("High", "Medium", "Low")
)

sample_annotation$DNA_input_binned = cut(sample_annotation$DNA_input,
                                         c(-Inf, 5, 10, Inf),
                                         labels = c("< 5 ng", "5 - 10 ng", "> 10 ng"))

sample_annotation$ClassificationDetail <-
  factor(
    sample_annotation$ClassificationDetail,
    levels = c("Incorrect tumor", "Correct", "Inconclusive")
  )

sample_annotation$labelmisclas <-
  ifelse(
    sample_annotation$ClassificationDetail == "Incorrect tumor",
    sample_annotation$ClassificationResult,
    ""
  )

datatable(
  sample_annotation,
  rownames = TRUE,
  filter = "top",
  options = list(pageLength = 5, scrollX = T),
  caption = "Overview of the sample annotation (analysis QC + sequencing QC + clinical data)"
)

datatable(
  describe(
    sample_annotation,
    omit = TRUE,
    quant = c(.25, .75)
  ),
  rownames = TRUE,
  filter = "top",
  options = list(pageLength = 5, scrollX = T),
  caption = "Descriptive statistics of the sample annotation table"
)

3 Correct and misclassifications

[1] "Overall"
[1] "The number of correct classifications: 49"
[1] "The number of misclassifications: 11"
[1] "The number of total samples: 60"
[1] "The percentage of correct classifications: 81.67%"
[1] ""
[1] "CNV profiles"
[1] "The number of correct classifications with flat CNV profile: 9"
[1] "The number of misclassifications with flat CNV profile: 9"
[1] "The number of total samples with flat CNV profile: 18"
[1] "The percentage of correct classifications with flat CNV profile: 50%"
[1] ""
[1] "Higher than 10%"
[1] "The number of correct classifications with >= 10% eTFx: 36"
[1] "The number of misclassifications with >= 10% eTFx: 0"
[1] "The number of total samples with >= 10% eTFx: 36"
[1] "The percentage of correct classifications with >= 10% eTFx: 100%"
[1] ""
[1] "Lower than 10%"
[1] "The number of correct classifications with < 10% eTFx: 13"
[1] "The number of misclassifications with < 10% eTFx: 11"
[1] "The number of total samples with < 10% eTFx: 24"
[1] "The percentage of correct classifications with < 10% eTFx: 54.17%"
[1] ""
[1] "Lower than 1%"
[1] "The number of correct classifications with < 1% eTFx: 3"
[1] "The number of misclassifications with < 1% eTFx: 4"
[1] "The number of total samples with < 1% eTFx: 7"
[1] "The percentage of correct classifications with < 1% eTFx: 42.86%"

4 Library & Sequencing QC

4.1 Reads after trimming

All samples had at least +- 15M reads.

sample_annotation$CorrectClassification <-
  as.character(sample_annotation$CorrectClassification)

sample_annotation$CorrectClassification <-
  ifelse(sample_annotation$CorrectClassification == "TRUE",
         "Correct",
         "Incorrect")

ggplot(sample_annotation,
       aes(x = M_Seqs , y = GraphTumor, col = ClassificationDetail)) +
  geom_beeswarm(
    color = "black",
    cex = 2,
    size = 2,
    groupOnX = FALSE
  ) + geom_beeswarm(cex = 2,
                    size = 1.5,
                    groupOnX = FALSE) + theme_point + labs(y = "Tumor type",
                                                           x = "# reads (M)",
                                                           col = "Classification call",
                                                           title = "Number of reads after adaptor trimming")  + xlim(0, 40) + theme(
                                                             panel.grid.major.y = element_line(linetype = "dashed", color = "gray88"),
                                                             axis.text.y = element_text(size = 8)
                                                           )

4.2 Genome-wide methylation

The genome wide methylation of CpGs is similar for all samples. There does not seem to be hypo- or hypermethylation for specific tumor types. Furthermore, misclassified samples are also not hyper or hypomethylated.

ggplot(sample_annotation,
       aes(x = Prct_mCpG, y = GraphTumor, col = ClassificationDetail)) +
  geom_beeswarm(
    color = "black",
    cex = 2,
    size = 2,
    groupOnX = FALSE
  ) + geom_beeswarm(cex = 2,
                    size = 1.5,
                    groupOnX = FALSE) + theme_point + xlim(0, 100) + labs(y = "Tumor type",
                                                                          x = "mCpG (%)",
                                                                          col = "Classification call",
                                                                          title = "Genome-wide methylation of CpGs")  + theme(
                                                                            panel.grid.major.y = element_line(linetype = "dashed", color = "gray88"),
                                                                            axis.text.y = element_text(size = 8)
                                                                          )

4.3 Percentage of on-bait bases

In general, almost all reads map within the expected regions (MspI fragments between 20 and 200 bp, evaluated with Picard HSmetrics). There are some outliers, but again this does not seem to be associated with misclassifications.

ggplot(
  sample_annotation,
  aes(x = Prct_OnBaitBases, y = GraphTumor, col = ClassificationDetail)
) +
  geom_beeswarm(
    color = "black",
    cex = 2,
    size = 2,
    groupOnX = FALSE
  ) + geom_beeswarm(cex = 2,
                    size = 1.5,
                    groupOnX = FALSE)   + theme_point + xlim(0, 100) + labs(y = "Tumor type",
                                                                            x = "On bait bases (%)",
                                                                            col = "Classification call",
                                                                            title = "Percentage of all reads mapping to \n MspI fragments between 20 and 200 bp") + theme(
                                                                              panel.grid.major.y = element_line(linetype = "dashed", color = "gray88"),
                                                                              axis.text.y = element_text(size = 8)
                                                                            )

4.4 Mapping efficiency

The mapping efficiency is usually around 50% (reads uniquely mapping to GRCh37 with Bismark). Outliers here are again not associated with misclassifications.

ggplot(sample_annotation,
       aes(x = Prct_Aligned, y = GraphTumor, col = ClassificationDetail)) +
  geom_beeswarm(
    color = "black",
    cex = 2,
    size = 2,
    groupOnX = FALSE
  ) + geom_beeswarm(cex = 2,
                    size = 1.5,
                    groupOnX = FALSE) + theme_point + xlim(0, 100) + labs(title = "Reads mapping uniquely to GRCh37 (with Bismark)",
                                                                          y = "Tumor type",
                                                                          x = "Aligned (%)",
                                                                          col = "Classification call") + theme(
                                                                            panel.grid.major.y = element_line(linetype = "dashed", color = "gray88"),
                                                                            axis.text.y = element_text(size = 8)
                                                                          )

4.5 Bisulfite conversion efficiency

Bisulfite conversion was successful in every sample (>95% bisulfite conversion efficiency). Assessed by mapping to the spiked-in lambda genome.

ggplot(
  sample_annotation,
  aes(
    x = 100 - Prct_Lambda_mCpG,
    y = GraphTumor,
    col = ClassificationDetail
  )
) +
  geom_beeswarm(
    color = "black",
    cex = 2,
    size = 2,
    groupOnX = FALSE
  ) + geom_beeswarm(cex = 2,
                    size = 1.5,
                    groupOnX = FALSE) + theme_point + xlim(92, 100) + labs(title = "Bisulfite conversion efficiency",
                                                                           y = "Tumor type",
                                                                           x = "Bisulfite conversion efficiency (%)",
                                                                           col = "Classification call") + theme(
                                                                             panel.grid.major.y = element_line(linetype = "dashed", color = "gray88"),
                                                                             axis.text.y = element_text(size = 8)
                                                                           )

4.6 cfDNA input (ng)

cfDNA input ranged from undetectable with Qubit (< 0.05 ng/µL) to 60 ng. The distributions between correctly classified and misclassified samples is not significantly different.

ggplot(
  sample_annotation,
  aes(y = DNA_input, x = CorrectClassification, col = ClassificationDetail)
) + geom_beeswarm(color = "black", cex = 2.5, size = 2) + geom_beeswarm(cex = 2.5, size = 1.5) + theme_point + stat_compare_means(
  label = "p.format",
  comparisons = list(c("Correct", "Incorrect")),
  tip.length = 0,
  bracket.size = 0.5
) + labs(title = "No association between DNA input and classification accuracy",
         y = "DNA input (ng)",
         x = "Correct classification",
         col = "Classification call")

4.7 On bait bases vs scRRBS


    Wilcoxon-Mann-Whitney test with continuity correction (confidence
    interval requires proportional odds assumption, but test does not)

data:  scRRBS_comp$Prct_OnBaitBases by scRRBS_comp$Protocol
Mann-Whitney estimate = 0, tie factor = 0.99991, p-value <
0.00000000000000022
alternative hypothesis: two distributions are not equal
95 percent confidence interval:
 0.00000000 0.03095146
sample estimates:
Mann-Whitney estimate 
                    0

[1] "cfRRBS % on bait:"

[1] "scRRBS % on bait:"

5 Classification results

5.1 Overall results

The samples that misclassified have a significantly lower eTFx (p = 0.0029, after omitting the “inconclusive” group).

class_results <-
  ggplot(
    sample_annotation %>% filter(ClassificationDetail != "Inconclusive"),
    aes(
      x = reorder(CorrectClassification, desc(CorrectClassification)),
      y = Estimated_TFx * 100,
      col = ClassificationDetail
    )
  ) +
  geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point + stat_compare_means(
    label = "p.format",
    comparisons = list(c(1, 2)),
    tip.length = 0,
    bracket.size = 0.5
  ) +
  labs(y = "Estimated TFx (%)", x = "Classfication", col = "Classification call")

ggsave(file = "classification.svg", plot = ggplot2::last_plot())
ggsave(file = "classification.png", plot = ggplot2::last_plot())

class_results

print("Summary of correctly classified tumors:")

## [1] "Summary of correctly classified tumors:"

sample_annotation %>% filter(CorrectClassification == "Correct") %>% select(Estimated_TFx) %>% summary

##  Estimated_TFx   
##  Min.   :0.0010  
##  1st Qu.:0.0880  
##  Median :0.3090  
##  Mean   :0.3459  
##  3rd Qu.:0.5480  
##  Max.   :1.0000

print("Summary of incorrectly classified tumors:")

## [1] "Summary of incorrectly classified tumors:"

sample_annotation %>% filter(CorrectClassification == "Incorrect") %>% select(Estimated_TFx) %>% summary

##  Estimated_TFx    
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.02100  
##  Mean   :0.02427  
##  3rd Qu.:0.03700  
##  Max.   :0.07600

5.2 Misclassifications by center

ggplot(sample_annotation,
       aes(
         x = reorder(Center, desc(Center)),
         y = Estimated_TFx * 100,
         col = ClassificationDetail
       )) +
  geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point +
  labs(y = "Estimated TFx (%)",
       x = "Center",
       col = "Classification call",
       title = "Misclassifications by center")

5.3 cfDNA/HMW ratio by center

ggplot(sample_annotation,
       aes(
         x = reorder(Center, desc(Center)),
         y = cfDNA_HMW_ratio,
         col = ClassificationDetail
       )) +
  geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point +
  labs(y = "cfDNA/HMW ratio", x = "Center", col = "Classification call")

5.4 Disease extent

Across all tumor types, the distribution of the eTFx is not significantly different between localized disease (incl N+) and metastatic disease. This is probably dependent on the tumor type, but there are insufficient samples per group to draw definitive conclusions from this.

print(wmwTest(
  sample_annotation$Estimated_TFx ~ sample_annotation$disease_extent
))

## 
##  Wilcoxon-Mann-Whitney test with continuity correction (confidence
##  interval requires proportional odds assumption, but test does not)
## 
## data:  sample_annotation$Estimated_TFx by sample_annotation$disease_extent
## Mann-Whitney estimate = 0.5651, tie factor = 0.99947, p-value = 0.3928
## alternative hypothesis: two distributions are not equal
## 95 percent confidence interval:
##  0.4188994 0.6990860
## sample estimates:
## Mann-Whitney estimate 
##             0.5650954

dz_extent <-
  ggplot(sample_annotation,
         aes(
           x = reorder(disease_extent, desc(disease_extent)),
           y = Estimated_TFx * 100,
           col = ClassificationDetail
         )) +
  geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point + stat_compare_means(
    label = "p.format",
    comparisons = list(c(1, 2)),
    tip.length = 0,
    bracket.size = 0.5
  ) + labs(y = "Estimated TFx (%)", x = "Disease extent", col = "Classification call")

dz_extent

print("Summary of correctly classified tumors:")

## [1] "Summary of correctly classified tumors:"

sample_annotation %>% filter(disease_extent == "Metastatic") %>% select(Estimated_TFx) %>% summary

##  Estimated_TFx   
##  Min.   :0.0000  
##  1st Qu.:0.0300  
##  Median :0.3090  
##  Mean   :0.3345  
##  3rd Qu.:0.5450  
##  Max.   :1.0000

print("Summary of incorrectly classified tumors:")

## [1] "Summary of incorrectly classified tumors:"

sample_annotation %>% filter(disease_extent == "Localized") %>% select(Estimated_TFx) %>% summary

##  Estimated_TFx  
##  Min.   :0.000  
##  1st Qu.:0.032  
##  Median :0.166  
##  Mean   :0.248  
##  3rd Qu.:0.518  
##  Max.   :0.787

5.5 Classifications by tumor class

The classifications split by tumor class:

sample_annotation$GraphTumor.ordered <-
  factor(
    sample_annotation$GraphTumor,
    levels = c(
      "NBL",
      "aRMS",
      "eRMS",
      "OS",
      "EWS",
      "WT",
      "CCSK",
      "MRT",
      "MB",
      "ATRT"
    )
  )

ggplot(
  sample_annotation,
  aes(
    x = reorder(GraphTumor.ordered, desc(GraphTumor.ordered)),
    y = Estimated_TFx * 100,
    col = ClassificationDetail,
    label = labelmisclas,
    shape = reorder(Biomaterial, desc(Biomaterial))
  )
) +
  geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point + geom_text_repel(size =
                                                                                                                           3, show.legend = FALSE) +
  theme(
    panel.grid.major.y = element_line(linetype = "dashed", color = "gray88"),
    axis.text.y = element_text(size = 8)
  ) +
  labs(y = "Estimated TFx (%)",
       x = "Tumor type",
       col = "Classification call",
       shape = "Biomaterial") +
  coord_flip()

ggsave(file = "classification_detail.svg", plot = ggplot2::last_plot())
ggsave(file = "classification_detail.png", plot = ggplot2::last_plot())

5.6 Full results of meth_atlas (plasma samples only)

This section contains all tumor fractions that are being detected per sample.

5.6.1 Table

fulldeconv <-
  fulldeconv %>% filter((tumor != "normal")) %>% filter((tumor != "wbc"))
fulldeconv_annot <-
  left_join(x = sample_annotation,
            y = fulldeconv,
            by = c("SampleID" = "SampleID"))

datatable(
  fulldeconv,
  rownames = TRUE,
  filter = "top",
  options = list(pageLength = 5, scrollX = T),
  caption = "The detected fractions for every sample"
)

5.6.2 Estimated TFx < 10%

ggplot(
  fulldeconv_annot %>% filter(Estimated_TFx < 0.10),
  aes(x = tumor, y = fraction, fill = ClassificationDetail)
)  +
  geom_bar(stat = "identity") + facet_wrap( ~ SampleID + GraphTumor, scales = "free") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(x = "tumor entity", y = "detected fraction", title = "Sample deconvolution for estimated TFx less than 10%")

5.6.3 Estimated TFx > 10% & < 30%

ggplot(
  fulldeconv_annot %>% filter((Estimated_TFx > 0.10) &
                                (Estimated_TFx < 30)),
  aes(x = tumor, y = fraction, fill = ClassificationDetail)
)  +
  geom_bar(stat = "identity") + facet_wrap( ~ SampleID + GraphTumor, scales = "free") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(x = "tumor entity", y = "detected fraction", title = "Sample deconvolution for estimated TFx between 10 and 30%")

5.6.4 Estimated TFx > 30%

ggplot(fulldeconv_annot %>% filter((Estimated_TFx > 0.30)),
       aes(x = tumor, y = fraction, fill = ClassificationDetail))  +
  geom_bar(stat = "identity") + facet_wrap( ~ SampleID + GraphTumor, scales = "free") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(x = "tumor entity", y = "detected fraction", title = "Sample deconvolution for estimated TFx higher than 30%")

5.7 Classifications by sample quality

Samples that have a low cfDNA-HMW ratio (so samples that contain relatively more HMW DNA), seem to have a lower estimated tumor fraction. For visualisations we also binned this, where a cfDNA-HMW ratio < 1 is high HMW contamination, a cfDNA-HMW ratio between 1 - 5 is medium HMW contamination and a cfDNA-HMW ratio > 5 is low HMW contamination, so a good quality sample.

ratio_cont <-
  ggplot(
    sample_annotation,
    aes(
      y = Estimated_TFx * 100,
      x = cfDNA_HMW_ratio,
      col = ClassificationDetail,
      shape = reorder(Biomaterial, desc(Biomaterial))
    )
  ) + scale_x_continuous(trans = 'log10') +
  geom_point(color = "black",  size = 2) + geom_point(size = 1.5) + theme_point +
  labs(y = "Estimated TFx (%)",
       x = "cfDNA-HMW ratio (log)",
       col = "Classification call",
       shape = "Biomaterial") + geom_vline(xintercept = 1,
                                           colour = "grey",
                                           linetype = "dashed") + geom_vline(xintercept = 5,
                                                                             colour = "grey",
                                                                             linetype = "dashed") +
  annotate(
    "text",
    x = 1,
    y = 90,
    size = 3.5,
    label = "High HMW\n",
    colour = "gray",
    angle = -270
  ) +
  annotate(
    "text",
    x = 1,
    y = 90,
    size = 3.5,
    label = "\nMedium HMW",
    colour = "gray",
    angle = 90
  ) +
  annotate(
    "text",
    x = 5,
    y = 90,
    size = 3.5,
    label = "Medium HMW\n",
    colour = "gray",
    angle = -270
  ) +
  annotate(
    "text",
    x = 5,
    y = 90,
    size = 3.5,
    label = "\nLow HMW",
    colour = "gray",
    angle = 90
  )

ratio_cont

ratio_binned <-
  ggplot(
    sample_annotation,
    aes(
      y = Estimated_TFx * 100,
      x = cfDNA_HMW_ratio_binned,
      col = ClassificationDetail,
      shape = reorder(Biomaterial, desc(Biomaterial))
    )
  ) +
  geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point +
  labs(y = "Estimated TFx (%)",
       x = "HMW contamination",
       col = "Classification call",
       shape = "Biomaterial")

ratio_binned


print(
  paste0(
    "Classification accuracy in the low HMW group: ",
    sample_annotation %>% filter(
      cfDNA_HMW_ratio_binned == "Low" &
        ClassificationDetail == "Correct"
    ) %>% nrow,
    "/",
    sample_annotation %>% filter(cfDNA_HMW_ratio_binned == "Low") %>% nrow
  )
)
[1] "Classification accuracy in the low HMW group: 30/30"

print(
  paste0(
    "Classification accuracy in the medium HMW group: ",
    sample_annotation %>% filter(
      cfDNA_HMW_ratio_binned == "Medium" &
        ClassificationDetail == "Correct"
    ) %>% nrow,
    "/",
    sample_annotation %>% filter(cfDNA_HMW_ratio_binned == "Medium") %>% nrow
  )
)
[1] "Classification accuracy in the medium HMW group: 11/20"

print(
  paste0(
    "Classification accuracy in the high HMW group: ",
    sample_annotation %>% filter(
      cfDNA_HMW_ratio_binned == "High" &
        ClassificationDetail == "Correct"
    ) %>% nrow,
    "/",
    sample_annotation %>% filter(cfDNA_HMW_ratio_binned == "High") %>% nrow
  )
)
[1] "Classification accuracy in the high HMW group: 8/10"

print(
  paste0(
    "Classification accuracy in the medium + high HMW group: ",
    sample_annotation %>% filter((
      cfDNA_HMW_ratio_binned == "Medium" |
        cfDNA_HMW_ratio_binned == "High"
    ) &
      ClassificationDetail == "Correct"
    ) %>% nrow,
    "/",
    sample_annotation %>% filter((
      cfDNA_HMW_ratio_binned == "Medium" |
        cfDNA_HMW_ratio_binned == "High"
    )
    ) %>% nrow
  )
)
[1] "Classification accuracy in the medium + high HMW group: 19/30"

print("Samples with discordant CNV profiles")
[1] "Samples with discordant CNV profiles"
sample_annotation_CNAFlat <-
  sample_annotation %>% filter(cfRRBS_sWGS_CNA  == "Flat-CNA")
print(paste0(
  "Low HMW: ",
  sample_annotation_CNAFlat %>% filter(cfDNA_HMW_ratio_binned == "Low") %>% nrow()
))
[1] "Low HMW: 0"
print(paste0(
  "High HMW: ",
  sample_annotation_CNAFlat %>% filter(cfDNA_HMW_ratio_binned == "High") %>% nrow()
))
[1] "High HMW: 5"
print(paste0(
  "Medium HMW: ",
  sample_annotation_CNAFlat %>% filter(cfDNA_HMW_ratio_binned == "Medium") %>% nrow()
))
[1] "Medium HMW: 5"
print(paste0("Total Flat-CNA: ", sample_annotation_CNAFlat %>% nrow()))
[1] "Total Flat-CNA: 10"

print("Samples with concordant CNV profiles")
[1] "Samples with concordant CNV profiles"
sample_annotation_CNACNA <-
  sample_annotation %>% filter((cfRRBS_sWGS_CNA  == "CNA-CNA") |
                                 (cfRRBS_sWGS_CNA  == "Flat-Flat"))

sample_annotation %>% filter((cfRRBS_sWGS_CNA  == "CNA-CNA")) %>% select(CNV_pearson) %>% summary
  CNV_pearson    
 Min.   :0.1300  
 1st Qu.:0.8175  
 Median :0.8750  
 Mean   :0.8228  
 3rd Qu.:0.9400  
 Max.   :0.9800  

print(paste0(
  "Low HMW: ",
  sample_annotation_CNACNA %>% filter(cfDNA_HMW_ratio_binned == "Low") %>% nrow()
))
[1] "Low HMW: 30"
print(paste0(
  "High HMW: ",
  sample_annotation_CNACNA  %>% filter(cfDNA_HMW_ratio_binned == "High") %>% nrow()
))
[1] "High HMW: 5"
print(paste0(
  "Medium HMW: ",
  sample_annotation_CNACNA  %>% filter(cfDNA_HMW_ratio_binned == "Medium") %>% nrow()
))
[1] "Medium HMW: 15"
print(paste0(
  "Total CNA-CNA or Flat-Flat: ",
  sample_annotation_CNACNA  %>% nrow()
))
[1] "Total CNA-CNA or Flat-Flat: 50"

print("Summary of correctly classified tumors:")
[1] "Summary of correctly classified tumors:"
sample_annotation %>% filter(disease_extent == "Metastatic") %>% select(Estimated_TFx) %>% summary
 Estimated_TFx   
 Min.   :0.0000  
 1st Qu.:0.0300  
 Median :0.3090  
 Mean   :0.3345  
 3rd Qu.:0.5450  
 Max.   :1.0000  

datatable(
  describe(
    sample_annotation_CNACNA,
    omit = TRUE,
    quant = c(.25, .75)
  ),
  rownames = TRUE,
  filter = "top",
  options = list(pageLength = 5, scrollX = T),
  caption = "Descriptive statistics of concordant samples (either CNA-CNA or flat-flat)"
)

5.8 IchorCNA - cfRRBS tumor fraction estimation

There is a good correlation between IchorCNA and cfRRBS to estimate the tumor fraction for samples with medium to low HMW contamination. Although the Spearman R seems to be higher in the Medium HMW contamination group, this is possibly due to the fact that there are less samples in that group.

ggplot(
  sample_annotation,
  aes(
    y = Estimated_TFx * 100,
    x = IchorCNA_TFx * 100,
    col = cfDNA_HMW_ratio_binned,
    fill = cfDNA_HMW_ratio_binned
  )
) +
  geom_smooth(method = "lm", se = TRUE) + geom_point(color = "black", size = 2) + geom_point(size = 1.5) + theme_point +
  geom_abline(
    slope = 1,
    intercept = 0,
    color = "gray",
    linetype = "dashed"
  ) +
  labs(
    x = "CNV-based eTFx (%)",
    y = "Methylation-based eTFx (%)",
    col = "HMW contamination",
    shape = "HMW contamination",
    fill = "HMW contamination",
    title = "Correlation between IchorCNA and cfRRBS-\nbased tumor fraction estimation"
  ) + ylim(NA, 100) + xlim(NA, 100) + stat_cor(method = "spearman") + scale_color_brewer(palette = "Greens") + scale_fill_brewer(palette =
                                                                                                                                             "Greens")

5.9 WisecondorX vs ichorCNA

If no CNAs are seen with WisecondorX, the CNV-based eTFx is close to 0.

ggplot(
  sample_annotation,
  aes(
    y = IchorCNA_TFx * 100,
    x =  sWGS_CNA ,
    col = cfDNA_HMW_ratio_binned,
    fill = cfDNA_HMW_ratio_binned
  )
) +
  geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point +
  labs(
    x = "WisecondorX result",
    y = "IchorCNA-based eTFx (%)",
    col = "HMW contamination",
    fill = "HMW contamination",
    title = "IchorCNA vs WisecondorX"
  )

describe(
  sample_annotation %>% filter(sWGS_CNA == "Flat") %>% select(IchorCNA_TFx) %>% pull(),
  omit = TRUE,
  quant = c(.25, .75)
)

5.10 cfRRBS - sWGS CNA correlation

If we can detect CNA with the cfRRBS method, the correlation with sWGS is high if enough input DNA was used.
It is expected that the correlation between two flat profiles is low (ideally 0, but can be higher due to noisy profiles like we see with the cfRRBS method).
The discordant samples (Flat with cfRRBS, CNA with sWGS) are again due to HMW contamination (dilution of the cfDNA peak).

ggplot(
  sample_annotation,
  aes(
    x = cfRRBS_sWGS_CNA,
    y = CNV_pearson,
    fill = cfDNA_HMW_ratio_binned,
    size = DNA_input_binned
  )
)  + scale_size_discrete(range = c(1, 4)) +
  geom_beeswarm(
    shape = 21,
    cex = 3,
    color = "black",
    stroke = 0.8
  ) + theme_point +
  labs(
    title = "Correlation between sWGS and cfRRBS-based \n CNV calling",
    x = "cfRRBS - sWGS copy number aberrations",
    y = "Pearson R",
    size = "DNA input (ng)",
    shape = "HMW contamination",
    fill = "HMW contamination"
  ) + scale_fill_brewer(palette = "Greens")

ConcordantLowToMed <-
  sample_annotation %>% filter((
    cfDNA_HMW_ratio_binned == "Medium" |
      cfDNA_HMW_ratio_binned == "Low"
  ) &
    (cfRRBS_sWGS_CNA == "CNA-CNA"  |
       cfRRBS_sWGS_CNA == "Flat-Flat")
  ) %>% nrow()
ConcordantHigh <-
  sample_annotation %>% filter((cfDNA_HMW_ratio_binned == "High") &
                                 (cfRRBS_sWGS_CNA == "CNA-CNA"  |
                                    cfRRBS_sWGS_CNA == "Flat-Flat")
  ) %>% nrow()
DiscordantLowToMed <-
  sample_annotation %>% filter((
    cfDNA_HMW_ratio_binned == "Medium" |
      cfDNA_HMW_ratio_binned == "Low"
  ) & (cfRRBS_sWGS_CNA == "Flat-CNA")
  ) %>% nrow()
DiscordantHigh <-
  sample_annotation %>% filter((cfDNA_HMW_ratio_binned == "High") &
                                 (cfRRBS_sWGS_CNA == "Flat-CNA")) %>% nrow()

FisherTest_df <-
  data.frame(
    row.names = c("Concordant", "Discordant"),
    LowToMed = c(ConcordantLowToMed, DiscordantLowToMed),
    High = c(ConcordantHigh, DiscordantHigh)
  )

chisq.test(FisherTest_df)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  FisherTest_df
## X-squared = 6.936, df = 1, p-value = 0.008448

6 NNLS model evaluation

datatable(
  resids_result,
  rownames = TRUE,
  filter = "top",
  options = list(pageLength = 5, scrollX = T),
  caption = "Residuals of NNLS"
)

rownames(resids_result) <- resids_result[, 1]

rownames(resids_result) <- sub("-.*", "", rownames(resids_result))
rownames(resids_result) <-
  sub("snake*", "", rownames(resids_result))
tmp <-
  rownames(resids_result)[(rownames(resids_result) %in% sample_annotation$SampleID)]

resids_result <- resids_result[tmp, ]
resids_result$SampleID <- rownames(resids_result)
resids_result$X1 <- NULL

resids_result <-
  merge(resids_result,
        sample_annotation,
        by.x = "SampleID",
        by.y = "SampleID")

p <-
  ggplot(resids_result, aes(sample = scale(Residuals))) + stat_qq() + stat_qq_line() + labs(title = "QQ-plot of residuals") + theme_point

p

p <-
  ggplot(resids_result, aes(x = scale(Residuals))) + geom_histogram(bins = 20) + labs(title = "Histogram of residuals") + theme_point

p

7 Reproducibility plot

The regions that were selected for comparison of RRBS data with Illumina HumanMethylation 450K array data are consistently covered in all samples. Eight regions of the total 14103 regions were not covered in any of the samples (total number of regions used = 14095). A read cut-off of 30 read per region was used for all downstream analyses.

row.names(reproducibility) <- reproducibility[, 1]
reproducibility <- reproducibility[, -1]

reproducibility$`Read cutoff` <-
  gsub("rco", "", reproducibility$`Read cutoff`)

reproducibility$`Read cutoff.ordered` <-
  factor(reproducibility$`Read cutoff`, levels = c("1", "30", "100"))

ggplot(reproducibility,
       aes(y = bin, x = count, col = `Read cutoff.ordered`)) + geom_line() + labs(y = "% of samples (n = 60)", x = "% of regions recurrently covered (n = 14095)", col = "read cutoff") +   theme_point + scale_y_continuous(breaks =
                                                                                                                                                                                                                               seq(0, 100, 20))  + scale_x_continuous(breaks = seq(0, 100, 20))

8 Grid plots

class_results <-
  ggplot(
    sample_annotation,
    aes(x = CorrectClassification, y = Estimated_TFx * 100, col = ClassificationDetail)
  ) +
  geom_beeswarm(color = "black", cex = 3.5, size = 2) + geom_beeswarm(cex = 3.5, size = 1.5) + theme_point + stat_compare_means(
    label = "p.signif",
    comparisons = list(c("Correct", "Incorrect")),
    tip.length = 0,
    bracket.size = 0.5
  ) +
  labs(y = "Estimated TFx (%)", x = "Classfication", col = "Classification call")

dz_extent <-
  ggplot(sample_annotation,
         aes(
           x = reorder(disease_extent, desc(disease_extent)),
           y = Estimated_TFx * 100,
           col = ClassificationDetail
         )) +
  geom_beeswarm(color = "black", cex = 3.5, size = 2) + geom_beeswarm(cex = 3.5, size = 1.5) + theme_point + stat_compare_means(
    label = "p.format",
    comparisons = list(c(1, 2)),
    tip.length = 0,
    bracket.size = 0.5
  ) + labs(y = "Estimated TFx (%)", x = "Disease extent", col = "Classification call")

ratio_cont <-
  ggplot(
    sample_annotation,
    aes(
      y = Estimated_TFx * 100,
      x = cfDNA_HMW_ratio,
      col = ClassificationDetail,
      shape = reorder(Biomaterial, desc(Biomaterial))
    )
  ) + scale_x_continuous(trans = 'log10') +
  geom_point(color = "black",  size = 2) + geom_point(size = 1.5) + theme_point +
  labs(y = "Estimated TFx (%)",
       x = "cfDNA-HMW ratio (log)",
       col = "Classification call",
       shape = "Biomaterial") + geom_vline(xintercept = 1,
                                           colour = "grey",
                                           linetype = "dashed") + geom_vline(xintercept = 5,
                                                                             colour = "grey",
                                                                             linetype = "dashed") +
  annotate(
    "text",
    x = 1,
    y = 90,
    size = 3.5,
    label = "High HMW\n",
    colour = "gray",
    angle = -270
  ) +
  annotate(
    "text",
    x = 1,
    y = 90,
    size = 3.5,
    label = "\nMedium HMW",
    colour = "gray",
    angle = 90
  ) +
  annotate(
    "text",
    x = 5,
    y = 90,
    size = 3.5,
    label = "Medium HMW\n",
    colour = "gray",
    angle = -270
  ) +
  annotate(
    "text",
    x = 5,
    y = 90,
    size = 3.5,
    label = "\nLow HMW",
    colour = "gray",
    angle = 90
  )

ratio_binned <-
  ggplot(
    sample_annotation,
    aes(
      y = Estimated_TFx * 100,
      x = cfDNA_HMW_ratio_binned,
      col = ClassificationDetail,
      shape = reorder(Biomaterial, desc(Biomaterial))
    )
  ) +
  geom_beeswarm(color = "black", cex = 3.5, size = 2) + geom_beeswarm(cex = 3.5, size = 1.5) + theme_point +
  labs(y = "Estimated TFx (%)",
       x = "HMW contamination",
       col = "Classification call",
       shape = "Biomaterial")

corrCNA <-
  ggplot(
    sample_annotation,
    aes(
      y = Estimated_TFx * 100,
      x = IchorCNA_TFx * 100,
      col = cfDNA_HMW_ratio_binned,
      fill = cfDNA_HMW_ratio_binned
    )
  ) +
  geom_smooth(method = "lm",
              se = TRUE,
              fullrange = FALSE) + geom_point(color = "black", size = 2) + geom_point(size = 1.5) + theme_point +
  geom_abline(
    slope = 1,
    intercept = 0,
    color = "gray",
    linetype = "dashed"
  ) +
  labs(
    x = "CNV-based eTFx (%)",
    y = "Methylation-based eTFx (%)",
    col = "HMW contamination",
    shape = "HMW contamination",
    fill = "HMW contamination"
  ) + ylim(NA, 100) + xlim(NA, 100) + stat_cor(method = "spearman", show.legend =  FALSE) + scale_color_brewer(palette = "Greens") + scale_fill_brewer(palette =
                                                                                                                                                                   "Greens")

CNA_R <-
  ggplot(
    sample_annotation,
    aes(x = cfRRBS_sWGS_CNA, y = CNV_pearson, col = cfDNA_HMW_ratio_binned)
  )  +
  geom_beeswarm(color = "black", cex = 3.5, size = 2) + geom_beeswarm(cex = 3.5, size = 1.5) + theme_point +
  labs(
    x = "cfRRBS - sWGS copy number aberrations",
    y = "Pearson R",
    size = "DNA input (ng)",
    shape = "HMW  contamination",
    fill = "HMW contamination"
  ) + scale_color_brewer(palette = "Greens")

grid1 <- grid_arrange_shared_legend(class_results,dz_extent, position = "right")

grid1
ggsave(file="grid1.svg", plot=grid1, width=8, height=3.5)
ggsave(file="grid1.png", plot=grid1, width=8, height=3.5)

grid2 <- grid_arrange_shared_legend(ratio_cont ,ratio_binned, position = "right")

grid2
ggsave(file="grid2.svg", plot=grid2, width=8, height=3.5)
ggsave(file="grid2.png", plot=grid2, width=8, height=3.5)

grid3 <- grid_arrange_shared_legend(corrCNA ,CNA_R, position = "right")

grid3
ggsave(file="grid3.svg", plot=grid3, width=8, height=3.5)
ggsave(file="grid3.png", plot=grid3, width=8, height=3.5)

9 Session info

sessionInfo()

## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] gdtools_0.2.1      DT_0.13            asht_0.9.4         coin_1.3-1        
##  [5] bpcp_1.4           survival_3.1-8     exact2x2_1.6.3.1   exactci_1.3-3     
##  [9] ssanv_1.1          irr_0.84.1         lpSolve_5.6.15     ggbeeswarm_0.6.0  
## [13] psych_1.9.12.31    httr_1.4.1         readxl_1.3.1       ggpubr_0.2.5      
## [17] magrittr_1.5       plotly_4.9.2.1     RCurl_1.98-1.1     ggExtra_0.9       
## [21] gridExtra_2.3      ggrepel_0.8.2      scales_1.1.0       RColorBrewer_1.1-2
## [25] broom_0.5.5        reshape2_1.4.3     forcats_0.5.0      stringr_1.4.0     
## [29] dplyr_0.8.5        purrr_0.3.3        readr_1.3.1        tidyr_1.0.2       
## [33] tibble_3.0.0       ggplot2_3.3.0      tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##  [1] TH.data_1.0-10     colorspace_1.4-1   ggsignif_0.6.0     ellipsis_0.3.0    
##  [5] modeltools_0.2-23  fs_1.4.0           rstudioapi_0.11    farver_2.0.3      
##  [9] fansi_0.4.1        mvtnorm_1.1-0      lubridate_1.7.4    xml2_1.3.0        
## [13] codetools_0.2-16   splines_3.6.3      mnormt_1.5-6       libcoin_1.0-5     
## [17] knitr_1.28         jsonlite_1.6.1     dbplyr_1.4.2       shiny_1.4.0.2     
## [21] compiler_3.6.3     backports_1.1.5    assertthat_0.2.1   Matrix_1.2-18     
## [25] fastmap_1.0.1      lazyeval_0.2.2     cli_2.0.2          later_1.0.0       
## [29] htmltools_0.4.0    tools_3.6.3        perm_1.0-0.0       gtable_0.3.0      
## [33] glue_1.3.2         Rcpp_1.0.4         cellranger_1.1.0   vctrs_0.2.4       
## [37] svglite_1.2.3      nlme_3.1-144       crosstalk_1.1.0.1  xfun_0.12         
## [41] rvest_0.3.5        mime_0.9           miniUI_0.1.1.1     lifecycle_0.2.0   
## [45] MASS_7.3-51.5      zoo_1.8-7          hms_0.5.3          promises_1.1.0    
## [49] parallel_3.6.3     sandwich_2.5-1     curl_4.3           yaml_2.2.1        
## [53] stringi_1.4.6      systemfonts_0.1.1  rlang_0.4.5        pkgconfig_2.0.3   
## [57] bitops_1.0-6       matrixStats_0.56.0 evaluate_0.14      lattice_0.20-38   
## [61] labeling_0.3       htmlwidgets_1.5.1  tidyselect_1.0.0   plyr_1.8.6        
## [65] R6_2.4.1           generics_0.0.2     multcomp_1.4-12    DBI_1.1.0         
## [69] mgcv_1.8-31        pillar_1.4.3       haven_2.2.0        withr_2.1.2       
## [73] modelr_0.1.6       crayon_1.3.4       rmarkdown_2.1      data.table_1.12.8 
## [77] reprex_0.3.0       digest_0.6.25      xtable_1.8-4       httpuv_1.5.2      
## [81] stats4_3.6.3       munsell_0.5.0      beeswarm_0.2.3     viridisLite_0.3.0 
## [85] vipor_0.4.5

Minimally invasive classification of pediatric solid tumors using reduced representation bisulfite sequencing of cell-free DNA: a proof-of-principle study: supplementary data

12/05/2020