The most up to date data-analysis is available on GitHub.
First, basic parameters are set up in this RMarkdown, such as loading dependencies, setting paths and setting up a uniform plot structure.
require(tidyverse)
require(reshape2)
require(broom)
require(RColorBrewer)
require(scales)
require(ggrepel)
require(gridExtra)
require(ggExtra)
require(ggplot2)
require(RCurl)
require(plotly)
require(ggpubr)
require(readxl)
require(httr)
require(psych)
require(ggbeeswarm)
require(irr)
require(asht)
require(DT)
source("HelperFunctions.R")
options(scipen = 10000)
# Themes
theme_point <-
theme_classic2() + theme(strip.background = element_blank())
theme_bar <-
theme_classic() + theme(
axis.text.x = element_text(
angle = 90,
vjust = 0.5,
hjust = 1
),
strip.background = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank()
)
theme_boxplot <-
theme_classic() + theme(
axis.text.x = element_text(
angle = 90,
vjust = 0.5,
hjust = 1
),
strip.background = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
axis.line.x = element_blank(),
legend.position = "none"
)
color.A = colorRampPalette(brewer.pal(4, "BrBG"))(4)[4]
color.B = colorRampPalette(brewer.pal(4, "BrBG"))(4)[2]
color.C = colorRampPalette(brewer.pal(4, "BrBG"))(4)[1]
url <-
"https://www.dropbox.com/s/yovbmtua0h4hx47/supplementary_table_2_analysis.xlsx?dl=1"
GET(url, write_disk(tf <- tempfile(fileext = ".xlsx")))
analysis <-
read_excel(tf,
sheet = "samples",
skip = 1,
col_names = TRUE)
scRRBS_annotation <-
read_excel(tf,
sheet = "public_scRRBS_QC",
skip = 1,
col_names = TRUE)
url <-
"https://www.biorxiv.org/content/biorxiv/early/2019/10/07/795047/DC2/embed/media-2.xlsx?download=true"
GET(url, write_disk(tf <- tempfile(fileext = ".xlsx")))
clinical_annotation <-
read_excel(tf,
sheet = "samples_final",
skip = 1,
col_names = TRUE)
clinical_annotation$Months_at_diagnosis <-
as.double(clinical_annotation$Months_at_diagnosis)
url <-
"https://www.dropbox.com/s/yovbmtua0h4hx47/supplementary_table_2_analysis.xlsx?dl=1"
GET(url, write_disk(tf <- tempfile(fileext = ".xlsx")))
HMW_ratio <-
read_excel(tf,
sheet = "cfDNA-HMW ratio",
skip = 0,
col_names = TRUE)
HMW_ratio <- HMW_ratio[!is.na(HMW_ratio$SampleID),]
fulldeconv <-
read_excel(tf,
sheet = "deconvolution_output",
skip = 0,
col_names = TRUE) %>% melt()
colnames(fulldeconv) <- c("tumor", "SampleID", "fraction")
resids_result <-
as.data.frame(read_excel(
tf,
sheet = "residuals",
skip = 0,
col_names = TRUE
))
reproducibility <-
as.data.frame(
read_csv(
"https://www.dropbox.com/s/0ernrimdmtquiu9/reproducibility_matrix.csv?dl=1",
na = c("NA")
)
)
sample_annotation <-
left_join(analysis, clinical_annotation, by = "SampleID")
sample_annotation <-
left_join(sample_annotation, HMW_ratio, by = "SampleID")
sample_annotation$IchorCNA_TFx <-
as.numeric(sample_annotation$IchorCNA_TFx)
sample_annotation <-
sample_annotation %>% mutate(ClassificationDetail = ifelse((sample_annotation$CorrectClassification == FALSE) &
(sample_annotation$ClassificationResult == "Normal"),
"Inconclusive",
ifelse((sample_annotation$CorrectClassification == FALSE) &
(sample_annotation$ClassificationResult != "Normal"),
"Incorrect tumor",
"Correct"
)
))
sample_annotation <- sample_annotation %>%
mutate(
disease_extent = ifelse(
sample_annotation$Metastatic_status == "M+",
"Metastatic",
"Localized"
)
)
sample_annotation <- sample_annotation %>%
mutate(cfRRBS_sWGS_CNA = paste0(cfRRBS_CNA, "-", sWGS_CNA))
sample_annotation$cfDNA_HMW_ratio_binned = cut(
sample_annotation$cfDNA_HMW_ratio,
c(-Inf, 1, 5, Inf),
labels = c("High", "Medium", "Low")
)
sample_annotation$DNA_input_binned = cut(sample_annotation$DNA_input,
c(-Inf, 5, 10, Inf),
labels = c("< 5 ng", "5 - 10 ng", "> 10 ng"))
sample_annotation$ClassificationDetail <-
factor(
sample_annotation$ClassificationDetail,
levels = c("Incorrect tumor", "Correct", "Inconclusive")
)
sample_annotation$labelmisclas <-
ifelse(
sample_annotation$ClassificationDetail == "Incorrect tumor",
sample_annotation$ClassificationResult,
""
)
datatable(
sample_annotation,
rownames = TRUE,
filter = "top",
options = list(pageLength = 5, scrollX = T),
caption = "Overview of the sample annotation (analysis QC + sequencing QC + clinical data)"
)
[1] "Overall"
[1] "The number of correct classifications: 49"
[1] "The number of misclassifications: 11"
[1] "The number of total samples: 60"
[1] "The percentage of correct classifications: 81.67%"
[1] ""
[1] "CNV profiles"
[1] "The number of correct classifications with flat CNV profile: 9"
[1] "The number of misclassifications with flat CNV profile: 9"
[1] "The number of total samples with flat CNV profile: 18"
[1] "The percentage of correct classifications with flat CNV profile: 50%"
[1] ""
[1] "Higher than 10%"
[1] "The number of correct classifications with >= 10% eTFx: 36"
[1] "The number of misclassifications with >= 10% eTFx: 0"
[1] "The number of total samples with >= 10% eTFx: 36"
[1] "The percentage of correct classifications with >= 10% eTFx: 100%"
[1] ""
[1] "Lower than 10%"
[1] "The number of correct classifications with < 10% eTFx: 13"
[1] "The number of misclassifications with < 10% eTFx: 11"
[1] "The number of total samples with < 10% eTFx: 24"
[1] "The percentage of correct classifications with < 10% eTFx: 54.17%"
[1] ""
[1] "Lower than 1%"
[1] "The number of correct classifications with < 1% eTFx: 3"
[1] "The number of misclassifications with < 1% eTFx: 4"
[1] "The number of total samples with < 1% eTFx: 7"
[1] "The percentage of correct classifications with < 1% eTFx: 42.86%"
All samples had at least +- 15M reads.
sample_annotation$CorrectClassification <-
as.character(sample_annotation$CorrectClassification)
sample_annotation$CorrectClassification <-
ifelse(sample_annotation$CorrectClassification == "TRUE",
"Correct",
"Incorrect")
ggplot(sample_annotation,
aes(x = M_Seqs , y = GraphTumor, col = ClassificationDetail)) +
geom_beeswarm(
color = "black",
cex = 2,
size = 2,
groupOnX = FALSE
) + geom_beeswarm(cex = 2,
size = 1.5,
groupOnX = FALSE) + theme_point + labs(y = "Tumor type",
x = "# reads (M)",
col = "Classification call",
title = "Number of reads after adaptor trimming") + xlim(0, 40) + theme(
panel.grid.major.y = element_line(linetype = "dashed", color = "gray88"),
axis.text.y = element_text(size = 8)
)
The genome wide methylation of CpGs is similar for all samples. There does not seem to be hypo- or hypermethylation for specific tumor types. Furthermore, misclassified samples are also not hyper or hypomethylated.
ggplot(sample_annotation,
aes(x = Prct_mCpG, y = GraphTumor, col = ClassificationDetail)) +
geom_beeswarm(
color = "black",
cex = 2,
size = 2,
groupOnX = FALSE
) + geom_beeswarm(cex = 2,
size = 1.5,
groupOnX = FALSE) + theme_point + xlim(0, 100) + labs(y = "Tumor type",
x = "mCpG (%)",
col = "Classification call",
title = "Genome-wide methylation of CpGs") + theme(
panel.grid.major.y = element_line(linetype = "dashed", color = "gray88"),
axis.text.y = element_text(size = 8)
)
In general, almost all reads map within the expected regions (MspI fragments between 20 and 200 bp, evaluated with Picard HSmetrics). There are some outliers, but again this does not seem to be associated with misclassifications.
ggplot(
sample_annotation,
aes(x = Prct_OnBaitBases, y = GraphTumor, col = ClassificationDetail)
) +
geom_beeswarm(
color = "black",
cex = 2,
size = 2,
groupOnX = FALSE
) + geom_beeswarm(cex = 2,
size = 1.5,
groupOnX = FALSE) + theme_point + xlim(0, 100) + labs(y = "Tumor type",
x = "On bait bases (%)",
col = "Classification call",
title = "Percentage of all reads mapping to \n MspI fragments between 20 and 200 bp") + theme(
panel.grid.major.y = element_line(linetype = "dashed", color = "gray88"),
axis.text.y = element_text(size = 8)
)
The mapping efficiency is usually around 50% (reads uniquely mapping to GRCh37 with Bismark). Outliers here are again not associated with misclassifications.
ggplot(sample_annotation,
aes(x = Prct_Aligned, y = GraphTumor, col = ClassificationDetail)) +
geom_beeswarm(
color = "black",
cex = 2,
size = 2,
groupOnX = FALSE
) + geom_beeswarm(cex = 2,
size = 1.5,
groupOnX = FALSE) + theme_point + xlim(0, 100) + labs(title = "Reads mapping uniquely to GRCh37 (with Bismark)",
y = "Tumor type",
x = "Aligned (%)",
col = "Classification call") + theme(
panel.grid.major.y = element_line(linetype = "dashed", color = "gray88"),
axis.text.y = element_text(size = 8)
)
Bisulfite conversion was successful in every sample (>95% bisulfite conversion efficiency). Assessed by mapping to the spiked-in lambda genome.
ggplot(
sample_annotation,
aes(
x = 100 - Prct_Lambda_mCpG,
y = GraphTumor,
col = ClassificationDetail
)
) +
geom_beeswarm(
color = "black",
cex = 2,
size = 2,
groupOnX = FALSE
) + geom_beeswarm(cex = 2,
size = 1.5,
groupOnX = FALSE) + theme_point + xlim(92, 100) + labs(title = "Bisulfite conversion efficiency",
y = "Tumor type",
x = "Bisulfite conversion efficiency (%)",
col = "Classification call") + theme(
panel.grid.major.y = element_line(linetype = "dashed", color = "gray88"),
axis.text.y = element_text(size = 8)
)
cfDNA input ranged from undetectable with Qubit (< 0.05 ng/µL) to 60 ng. The distributions between correctly classified and misclassified samples is not significantly different.
ggplot(
sample_annotation,
aes(y = DNA_input, x = CorrectClassification, col = ClassificationDetail)
) + geom_beeswarm(color = "black", cex = 2.5, size = 2) + geom_beeswarm(cex = 2.5, size = 1.5) + theme_point + stat_compare_means(
label = "p.format",
comparisons = list(c("Correct", "Incorrect")),
tip.length = 0,
bracket.size = 0.5
) + labs(title = "No association between DNA input and classification accuracy",
y = "DNA input (ng)",
x = "Correct classification",
col = "Classification call")
Wilcoxon-Mann-Whitney test with continuity correction (confidence
interval requires proportional odds assumption, but test does not)
data: scRRBS_comp$Prct_OnBaitBases by scRRBS_comp$Protocol
Mann-Whitney estimate = 0, tie factor = 0.99991, p-value <
0.00000000000000022
alternative hypothesis: two distributions are not equal
95 percent confidence interval:
0.00000000 0.03095146
sample estimates:
Mann-Whitney estimate
0
[1] "cfRRBS % on bait:"
[1] "scRRBS % on bait:"
The samples that misclassified have a significantly lower eTFx (p = 0.0029, after omitting the “inconclusive” group).
class_results <-
ggplot(
sample_annotation %>% filter(ClassificationDetail != "Inconclusive"),
aes(
x = reorder(CorrectClassification, desc(CorrectClassification)),
y = Estimated_TFx * 100,
col = ClassificationDetail
)
) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point + stat_compare_means(
label = "p.format",
comparisons = list(c(1, 2)),
tip.length = 0,
bracket.size = 0.5
) +
labs(y = "Estimated TFx (%)", x = "Classfication", col = "Classification call")
ggsave(file = "classification.svg", plot = ggplot2::last_plot())
ggsave(file = "classification.png", plot = ggplot2::last_plot())
class_results
## [1] "Summary of correctly classified tumors:"
## Estimated_TFx
## Min. :0.0010
## 1st Qu.:0.0880
## Median :0.3090
## Mean :0.3459
## 3rd Qu.:0.5480
## Max. :1.0000
## [1] "Summary of incorrectly classified tumors:"
## Estimated_TFx
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.02100
## Mean :0.02427
## 3rd Qu.:0.03700
## Max. :0.07600
ggplot(sample_annotation,
aes(
x = reorder(Center, desc(Center)),
y = Estimated_TFx * 100,
col = ClassificationDetail
)) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point +
labs(y = "Estimated TFx (%)",
x = "Center",
col = "Classification call",
title = "Misclassifications by center")
ggplot(sample_annotation,
aes(
x = reorder(Center, desc(Center)),
y = cfDNA_HMW_ratio,
col = ClassificationDetail
)) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point +
labs(y = "cfDNA/HMW ratio", x = "Center", col = "Classification call")
Across all tumor types, the distribution of the eTFx is not significantly different between localized disease (incl N+) and metastatic disease. This is probably dependent on the tumor type, but there are insufficient samples per group to draw definitive conclusions from this.
##
## Wilcoxon-Mann-Whitney test with continuity correction (confidence
## interval requires proportional odds assumption, but test does not)
##
## data: sample_annotation$Estimated_TFx by sample_annotation$disease_extent
## Mann-Whitney estimate = 0.5651, tie factor = 0.99947, p-value = 0.3928
## alternative hypothesis: two distributions are not equal
## 95 percent confidence interval:
## 0.4188994 0.6990860
## sample estimates:
## Mann-Whitney estimate
## 0.5650954
dz_extent <-
ggplot(sample_annotation,
aes(
x = reorder(disease_extent, desc(disease_extent)),
y = Estimated_TFx * 100,
col = ClassificationDetail
)) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point + stat_compare_means(
label = "p.format",
comparisons = list(c(1, 2)),
tip.length = 0,
bracket.size = 0.5
) + labs(y = "Estimated TFx (%)", x = "Disease extent", col = "Classification call")
dz_extent
## [1] "Summary of correctly classified tumors:"
## Estimated_TFx
## Min. :0.0000
## 1st Qu.:0.0300
## Median :0.3090
## Mean :0.3345
## 3rd Qu.:0.5450
## Max. :1.0000
## [1] "Summary of incorrectly classified tumors:"
## Estimated_TFx
## Min. :0.000
## 1st Qu.:0.032
## Median :0.166
## Mean :0.248
## 3rd Qu.:0.518
## Max. :0.787
The classifications split by tumor class:
sample_annotation$GraphTumor.ordered <-
factor(
sample_annotation$GraphTumor,
levels = c(
"NBL",
"aRMS",
"eRMS",
"OS",
"EWS",
"WT",
"CCSK",
"MRT",
"MB",
"ATRT"
)
)
ggplot(
sample_annotation,
aes(
x = reorder(GraphTumor.ordered, desc(GraphTumor.ordered)),
y = Estimated_TFx * 100,
col = ClassificationDetail,
label = labelmisclas,
shape = reorder(Biomaterial, desc(Biomaterial))
)
) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point + geom_text_repel(size =
3, show.legend = FALSE) +
theme(
panel.grid.major.y = element_line(linetype = "dashed", color = "gray88"),
axis.text.y = element_text(size = 8)
) +
labs(y = "Estimated TFx (%)",
x = "Tumor type",
col = "Classification call",
shape = "Biomaterial") +
coord_flip()
This section contains all tumor fractions that are being detected per sample.
fulldeconv <-
fulldeconv %>% filter((tumor != "normal")) %>% filter((tumor != "wbc"))
fulldeconv_annot <-
left_join(x = sample_annotation,
y = fulldeconv,
by = c("SampleID" = "SampleID"))
datatable(
fulldeconv,
rownames = TRUE,
filter = "top",
options = list(pageLength = 5, scrollX = T),
caption = "The detected fractions for every sample"
)
ggplot(
fulldeconv_annot %>% filter(Estimated_TFx < 0.10),
aes(x = tumor, y = fraction, fill = ClassificationDetail)
) +
geom_bar(stat = "identity") + facet_wrap( ~ SampleID + GraphTumor, scales = "free") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(x = "tumor entity", y = "detected fraction", title = "Sample deconvolution for estimated TFx less than 10%")
ggplot(
fulldeconv_annot %>% filter((Estimated_TFx > 0.10) &
(Estimated_TFx < 30)),
aes(x = tumor, y = fraction, fill = ClassificationDetail)
) +
geom_bar(stat = "identity") + facet_wrap( ~ SampleID + GraphTumor, scales = "free") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(x = "tumor entity", y = "detected fraction", title = "Sample deconvolution for estimated TFx between 10 and 30%")
ggplot(fulldeconv_annot %>% filter((Estimated_TFx > 0.30)),
aes(x = tumor, y = fraction, fill = ClassificationDetail)) +
geom_bar(stat = "identity") + facet_wrap( ~ SampleID + GraphTumor, scales = "free") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(x = "tumor entity", y = "detected fraction", title = "Sample deconvolution for estimated TFx higher than 30%")
Samples that have a low cfDNA-HMW ratio (so samples that contain relatively more HMW DNA), seem to have a lower estimated tumor fraction. For visualisations we also binned this, where a cfDNA-HMW ratio < 1 is high HMW contamination, a cfDNA-HMW ratio between 1 - 5 is medium HMW contamination and a cfDNA-HMW ratio > 5 is low HMW contamination, so a good quality sample.
ratio_cont <-
ggplot(
sample_annotation,
aes(
y = Estimated_TFx * 100,
x = cfDNA_HMW_ratio,
col = ClassificationDetail,
shape = reorder(Biomaterial, desc(Biomaterial))
)
) + scale_x_continuous(trans = 'log10') +
geom_point(color = "black", size = 2) + geom_point(size = 1.5) + theme_point +
labs(y = "Estimated TFx (%)",
x = "cfDNA-HMW ratio (log)",
col = "Classification call",
shape = "Biomaterial") + geom_vline(xintercept = 1,
colour = "grey",
linetype = "dashed") + geom_vline(xintercept = 5,
colour = "grey",
linetype = "dashed") +
annotate(
"text",
x = 1,
y = 90,
size = 3.5,
label = "High HMW\n",
colour = "gray",
angle = -270
) +
annotate(
"text",
x = 1,
y = 90,
size = 3.5,
label = "\nMedium HMW",
colour = "gray",
angle = 90
) +
annotate(
"text",
x = 5,
y = 90,
size = 3.5,
label = "Medium HMW\n",
colour = "gray",
angle = -270
) +
annotate(
"text",
x = 5,
y = 90,
size = 3.5,
label = "\nLow HMW",
colour = "gray",
angle = 90
)
ratio_cont
ratio_binned <-
ggplot(
sample_annotation,
aes(
y = Estimated_TFx * 100,
x = cfDNA_HMW_ratio_binned,
col = ClassificationDetail,
shape = reorder(Biomaterial, desc(Biomaterial))
)
) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point +
labs(y = "Estimated TFx (%)",
x = "HMW contamination",
col = "Classification call",
shape = "Biomaterial")
ratio_binned
print(
paste0(
"Classification accuracy in the low HMW group: ",
sample_annotation %>% filter(
cfDNA_HMW_ratio_binned == "Low" &
ClassificationDetail == "Correct"
) %>% nrow,
"/",
sample_annotation %>% filter(cfDNA_HMW_ratio_binned == "Low") %>% nrow
)
)
[1] "Classification accuracy in the low HMW group: 30/30"
print(
paste0(
"Classification accuracy in the medium HMW group: ",
sample_annotation %>% filter(
cfDNA_HMW_ratio_binned == "Medium" &
ClassificationDetail == "Correct"
) %>% nrow,
"/",
sample_annotation %>% filter(cfDNA_HMW_ratio_binned == "Medium") %>% nrow
)
)
[1] "Classification accuracy in the medium HMW group: 11/20"
print(
paste0(
"Classification accuracy in the high HMW group: ",
sample_annotation %>% filter(
cfDNA_HMW_ratio_binned == "High" &
ClassificationDetail == "Correct"
) %>% nrow,
"/",
sample_annotation %>% filter(cfDNA_HMW_ratio_binned == "High") %>% nrow
)
)
[1] "Classification accuracy in the high HMW group: 8/10"
print(
paste0(
"Classification accuracy in the medium + high HMW group: ",
sample_annotation %>% filter((
cfDNA_HMW_ratio_binned == "Medium" |
cfDNA_HMW_ratio_binned == "High"
) &
ClassificationDetail == "Correct"
) %>% nrow,
"/",
sample_annotation %>% filter((
cfDNA_HMW_ratio_binned == "Medium" |
cfDNA_HMW_ratio_binned == "High"
)
) %>% nrow
)
)
[1] "Classification accuracy in the medium + high HMW group: 19/30"
print("Samples with discordant CNV profiles")
[1] "Samples with discordant CNV profiles"
sample_annotation_CNAFlat <-
sample_annotation %>% filter(cfRRBS_sWGS_CNA == "Flat-CNA")
print(paste0(
"Low HMW: ",
sample_annotation_CNAFlat %>% filter(cfDNA_HMW_ratio_binned == "Low") %>% nrow()
))
[1] "Low HMW: 0"
print(paste0(
"High HMW: ",
sample_annotation_CNAFlat %>% filter(cfDNA_HMW_ratio_binned == "High") %>% nrow()
))
[1] "High HMW: 5"
print(paste0(
"Medium HMW: ",
sample_annotation_CNAFlat %>% filter(cfDNA_HMW_ratio_binned == "Medium") %>% nrow()
))
[1] "Medium HMW: 5"
print(paste0("Total Flat-CNA: ", sample_annotation_CNAFlat %>% nrow()))
[1] "Total Flat-CNA: 10"
print("Samples with concordant CNV profiles")
[1] "Samples with concordant CNV profiles"
sample_annotation_CNACNA <-
sample_annotation %>% filter((cfRRBS_sWGS_CNA == "CNA-CNA") |
(cfRRBS_sWGS_CNA == "Flat-Flat"))
sample_annotation %>% filter((cfRRBS_sWGS_CNA == "CNA-CNA")) %>% select(CNV_pearson) %>% summary
CNV_pearson
Min. :0.1300
1st Qu.:0.8175
Median :0.8750
Mean :0.8228
3rd Qu.:0.9400
Max. :0.9800
print(paste0(
"Low HMW: ",
sample_annotation_CNACNA %>% filter(cfDNA_HMW_ratio_binned == "Low") %>% nrow()
))
[1] "Low HMW: 30"
print(paste0(
"High HMW: ",
sample_annotation_CNACNA %>% filter(cfDNA_HMW_ratio_binned == "High") %>% nrow()
))
[1] "High HMW: 5"
print(paste0(
"Medium HMW: ",
sample_annotation_CNACNA %>% filter(cfDNA_HMW_ratio_binned == "Medium") %>% nrow()
))
[1] "Medium HMW: 15"
print(paste0(
"Total CNA-CNA or Flat-Flat: ",
sample_annotation_CNACNA %>% nrow()
))
[1] "Total CNA-CNA or Flat-Flat: 50"
print("Summary of correctly classified tumors:")
[1] "Summary of correctly classified tumors:"
sample_annotation %>% filter(disease_extent == "Metastatic") %>% select(Estimated_TFx) %>% summary
Estimated_TFx
Min. :0.0000
1st Qu.:0.0300
Median :0.3090
Mean :0.3345
3rd Qu.:0.5450
Max. :1.0000
datatable(
describe(
sample_annotation_CNACNA,
omit = TRUE,
quant = c(.25, .75)
),
rownames = TRUE,
filter = "top",
options = list(pageLength = 5, scrollX = T),
caption = "Descriptive statistics of concordant samples (either CNA-CNA or flat-flat)"
)
There is a good correlation between IchorCNA and cfRRBS to estimate the tumor fraction for samples with medium to low HMW contamination. Although the Spearman R seems to be higher in the Medium HMW contamination group, this is possibly due to the fact that there are less samples in that group.
ggplot(
sample_annotation,
aes(
y = Estimated_TFx * 100,
x = IchorCNA_TFx * 100,
col = cfDNA_HMW_ratio_binned,
fill = cfDNA_HMW_ratio_binned
)
) +
geom_smooth(method = "lm", se = TRUE) + geom_point(color = "black", size = 2) + geom_point(size = 1.5) + theme_point +
geom_abline(
slope = 1,
intercept = 0,
color = "gray",
linetype = "dashed"
) +
labs(
x = "CNV-based eTFx (%)",
y = "Methylation-based eTFx (%)",
col = "HMW contamination",
shape = "HMW contamination",
fill = "HMW contamination",
title = "Correlation between IchorCNA and cfRRBS-\nbased tumor fraction estimation"
) + ylim(NA, 100) + xlim(NA, 100) + stat_cor(method = "spearman") + scale_color_brewer(palette = "Greens") + scale_fill_brewer(palette =
"Greens")
If no CNAs are seen with WisecondorX, the CNV-based eTFx is close to 0.
ggplot(
sample_annotation,
aes(
y = IchorCNA_TFx * 100,
x = sWGS_CNA ,
col = cfDNA_HMW_ratio_binned,
fill = cfDNA_HMW_ratio_binned
)
) +
geom_beeswarm(color = "black", cex = 2, size = 2) + geom_beeswarm(cex = 2, size = 1.5) + theme_point +
labs(
x = "WisecondorX result",
y = "IchorCNA-based eTFx (%)",
col = "HMW contamination",
fill = "HMW contamination",
title = "IchorCNA vs WisecondorX"
)
describe(
sample_annotation %>% filter(sWGS_CNA == "Flat") %>% select(IchorCNA_TFx) %>% pull(),
omit = TRUE,
quant = c(.25, .75)
)
ggplot(
sample_annotation,
aes(
x = cfRRBS_sWGS_CNA,
y = CNV_pearson,
fill = cfDNA_HMW_ratio_binned,
size = DNA_input_binned
)
) + scale_size_discrete(range = c(1, 4)) +
geom_beeswarm(
shape = 21,
cex = 3,
color = "black",
stroke = 0.8
) + theme_point +
labs(
title = "Correlation between sWGS and cfRRBS-based \n CNV calling",
x = "cfRRBS - sWGS copy number aberrations",
y = "Pearson R",
size = "DNA input (ng)",
shape = "HMW contamination",
fill = "HMW contamination"
) + scale_fill_brewer(palette = "Greens")
ConcordantLowToMed <-
sample_annotation %>% filter((
cfDNA_HMW_ratio_binned == "Medium" |
cfDNA_HMW_ratio_binned == "Low"
) &
(cfRRBS_sWGS_CNA == "CNA-CNA" |
cfRRBS_sWGS_CNA == "Flat-Flat")
) %>% nrow()
ConcordantHigh <-
sample_annotation %>% filter((cfDNA_HMW_ratio_binned == "High") &
(cfRRBS_sWGS_CNA == "CNA-CNA" |
cfRRBS_sWGS_CNA == "Flat-Flat")
) %>% nrow()
DiscordantLowToMed <-
sample_annotation %>% filter((
cfDNA_HMW_ratio_binned == "Medium" |
cfDNA_HMW_ratio_binned == "Low"
) & (cfRRBS_sWGS_CNA == "Flat-CNA")
) %>% nrow()
DiscordantHigh <-
sample_annotation %>% filter((cfDNA_HMW_ratio_binned == "High") &
(cfRRBS_sWGS_CNA == "Flat-CNA")) %>% nrow()
FisherTest_df <-
data.frame(
row.names = c("Concordant", "Discordant"),
LowToMed = c(ConcordantLowToMed, DiscordantLowToMed),
High = c(ConcordantHigh, DiscordantHigh)
)
chisq.test(FisherTest_df)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: FisherTest_df
## X-squared = 6.936, df = 1, p-value = 0.008448
datatable(
resids_result,
rownames = TRUE,
filter = "top",
options = list(pageLength = 5, scrollX = T),
caption = "Residuals of NNLS"
)
rownames(resids_result) <- resids_result[, 1]
rownames(resids_result) <- sub("-.*", "", rownames(resids_result))
rownames(resids_result) <-
sub("snake*", "", rownames(resids_result))
tmp <-
rownames(resids_result)[(rownames(resids_result) %in% sample_annotation$SampleID)]
resids_result <- resids_result[tmp, ]
resids_result$SampleID <- rownames(resids_result)
resids_result$X1 <- NULL
resids_result <-
merge(resids_result,
sample_annotation,
by.x = "SampleID",
by.y = "SampleID")
p <-
ggplot(resids_result, aes(sample = scale(Residuals))) + stat_qq() + stat_qq_line() + labs(title = "QQ-plot of residuals") + theme_point
p
p <-
ggplot(resids_result, aes(x = scale(Residuals))) + geom_histogram(bins = 20) + labs(title = "Histogram of residuals") + theme_point
p
The regions that were selected for comparison of RRBS data with Illumina HumanMethylation 450K array data are consistently covered in all samples. Eight regions of the total 14103 regions were not covered in any of the samples (total number of regions used = 14095). A read cut-off of 30 read per region was used for all downstream analyses.
row.names(reproducibility) <- reproducibility[, 1]
reproducibility <- reproducibility[, -1]
reproducibility$`Read cutoff` <-
gsub("rco", "", reproducibility$`Read cutoff`)
reproducibility$`Read cutoff.ordered` <-
factor(reproducibility$`Read cutoff`, levels = c("1", "30", "100"))
ggplot(reproducibility,
aes(y = bin, x = count, col = `Read cutoff.ordered`)) + geom_line() + labs(y = "% of samples (n = 60)", x = "% of regions recurrently covered (n = 14095)", col = "read cutoff") + theme_point + scale_y_continuous(breaks =
seq(0, 100, 20)) + scale_x_continuous(breaks = seq(0, 100, 20))
class_results <-
ggplot(
sample_annotation,
aes(x = CorrectClassification, y = Estimated_TFx * 100, col = ClassificationDetail)
) +
geom_beeswarm(color = "black", cex = 3.5, size = 2) + geom_beeswarm(cex = 3.5, size = 1.5) + theme_point + stat_compare_means(
label = "p.signif",
comparisons = list(c("Correct", "Incorrect")),
tip.length = 0,
bracket.size = 0.5
) +
labs(y = "Estimated TFx (%)", x = "Classfication", col = "Classification call")
dz_extent <-
ggplot(sample_annotation,
aes(
x = reorder(disease_extent, desc(disease_extent)),
y = Estimated_TFx * 100,
col = ClassificationDetail
)) +
geom_beeswarm(color = "black", cex = 3.5, size = 2) + geom_beeswarm(cex = 3.5, size = 1.5) + theme_point + stat_compare_means(
label = "p.format",
comparisons = list(c(1, 2)),
tip.length = 0,
bracket.size = 0.5
) + labs(y = "Estimated TFx (%)", x = "Disease extent", col = "Classification call")
ratio_cont <-
ggplot(
sample_annotation,
aes(
y = Estimated_TFx * 100,
x = cfDNA_HMW_ratio,
col = ClassificationDetail,
shape = reorder(Biomaterial, desc(Biomaterial))
)
) + scale_x_continuous(trans = 'log10') +
geom_point(color = "black", size = 2) + geom_point(size = 1.5) + theme_point +
labs(y = "Estimated TFx (%)",
x = "cfDNA-HMW ratio (log)",
col = "Classification call",
shape = "Biomaterial") + geom_vline(xintercept = 1,
colour = "grey",
linetype = "dashed") + geom_vline(xintercept = 5,
colour = "grey",
linetype = "dashed") +
annotate(
"text",
x = 1,
y = 90,
size = 3.5,
label = "High HMW\n",
colour = "gray",
angle = -270
) +
annotate(
"text",
x = 1,
y = 90,
size = 3.5,
label = "\nMedium HMW",
colour = "gray",
angle = 90
) +
annotate(
"text",
x = 5,
y = 90,
size = 3.5,
label = "Medium HMW\n",
colour = "gray",
angle = -270
) +
annotate(
"text",
x = 5,
y = 90,
size = 3.5,
label = "\nLow HMW",
colour = "gray",
angle = 90
)
ratio_binned <-
ggplot(
sample_annotation,
aes(
y = Estimated_TFx * 100,
x = cfDNA_HMW_ratio_binned,
col = ClassificationDetail,
shape = reorder(Biomaterial, desc(Biomaterial))
)
) +
geom_beeswarm(color = "black", cex = 3.5, size = 2) + geom_beeswarm(cex = 3.5, size = 1.5) + theme_point +
labs(y = "Estimated TFx (%)",
x = "HMW contamination",
col = "Classification call",
shape = "Biomaterial")
corrCNA <-
ggplot(
sample_annotation,
aes(
y = Estimated_TFx * 100,
x = IchorCNA_TFx * 100,
col = cfDNA_HMW_ratio_binned,
fill = cfDNA_HMW_ratio_binned
)
) +
geom_smooth(method = "lm",
se = TRUE,
fullrange = FALSE) + geom_point(color = "black", size = 2) + geom_point(size = 1.5) + theme_point +
geom_abline(
slope = 1,
intercept = 0,
color = "gray",
linetype = "dashed"
) +
labs(
x = "CNV-based eTFx (%)",
y = "Methylation-based eTFx (%)",
col = "HMW contamination",
shape = "HMW contamination",
fill = "HMW contamination"
) + ylim(NA, 100) + xlim(NA, 100) + stat_cor(method = "spearman", show.legend = FALSE) + scale_color_brewer(palette = "Greens") + scale_fill_brewer(palette =
"Greens")
CNA_R <-
ggplot(
sample_annotation,
aes(x = cfRRBS_sWGS_CNA, y = CNV_pearson, col = cfDNA_HMW_ratio_binned)
) +
geom_beeswarm(color = "black", cex = 3.5, size = 2) + geom_beeswarm(cex = 3.5, size = 1.5) + theme_point +
labs(
x = "cfRRBS - sWGS copy number aberrations",
y = "Pearson R",
size = "DNA input (ng)",
shape = "HMW contamination",
fill = "HMW contamination"
) + scale_color_brewer(palette = "Greens")
grid1
ggsave(file="grid1.svg", plot=grid1, width=8, height=3.5)
ggsave(file="grid1.png", plot=grid1, width=8, height=3.5)
grid2 <- grid_arrange_shared_legend(ratio_cont ,ratio_binned, position = "right")
grid2
ggsave(file="grid2.svg", plot=grid2, width=8, height=3.5)
ggsave(file="grid2.png", plot=grid2, width=8, height=3.5)
grid3 <- grid_arrange_shared_legend(corrCNA ,CNA_R, position = "right")
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] gdtools_0.2.1 DT_0.13 asht_0.9.4 coin_1.3-1
## [5] bpcp_1.4 survival_3.1-8 exact2x2_1.6.3.1 exactci_1.3-3
## [9] ssanv_1.1 irr_0.84.1 lpSolve_5.6.15 ggbeeswarm_0.6.0
## [13] psych_1.9.12.31 httr_1.4.1 readxl_1.3.1 ggpubr_0.2.5
## [17] magrittr_1.5 plotly_4.9.2.1 RCurl_1.98-1.1 ggExtra_0.9
## [21] gridExtra_2.3 ggrepel_0.8.2 scales_1.1.0 RColorBrewer_1.1-2
## [25] broom_0.5.5 reshape2_1.4.3 forcats_0.5.0 stringr_1.4.0
## [29] dplyr_0.8.5 purrr_0.3.3 readr_1.3.1 tidyr_1.0.2
## [33] tibble_3.0.0 ggplot2_3.3.0 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.0-10 colorspace_1.4-1 ggsignif_0.6.0 ellipsis_0.3.0
## [5] modeltools_0.2-23 fs_1.4.0 rstudioapi_0.11 farver_2.0.3
## [9] fansi_0.4.1 mvtnorm_1.1-0 lubridate_1.7.4 xml2_1.3.0
## [13] codetools_0.2-16 splines_3.6.3 mnormt_1.5-6 libcoin_1.0-5
## [17] knitr_1.28 jsonlite_1.6.1 dbplyr_1.4.2 shiny_1.4.0.2
## [21] compiler_3.6.3 backports_1.1.5 assertthat_0.2.1 Matrix_1.2-18
## [25] fastmap_1.0.1 lazyeval_0.2.2 cli_2.0.2 later_1.0.0
## [29] htmltools_0.4.0 tools_3.6.3 perm_1.0-0.0 gtable_0.3.0
## [33] glue_1.3.2 Rcpp_1.0.4 cellranger_1.1.0 vctrs_0.2.4
## [37] svglite_1.2.3 nlme_3.1-144 crosstalk_1.1.0.1 xfun_0.12
## [41] rvest_0.3.5 mime_0.9 miniUI_0.1.1.1 lifecycle_0.2.0
## [45] MASS_7.3-51.5 zoo_1.8-7 hms_0.5.3 promises_1.1.0
## [49] parallel_3.6.3 sandwich_2.5-1 curl_4.3 yaml_2.2.1
## [53] stringi_1.4.6 systemfonts_0.1.1 rlang_0.4.5 pkgconfig_2.0.3
## [57] bitops_1.0-6 matrixStats_0.56.0 evaluate_0.14 lattice_0.20-38
## [61] labeling_0.3 htmlwidgets_1.5.1 tidyselect_1.0.0 plyr_1.8.6
## [65] R6_2.4.1 generics_0.0.2 multcomp_1.4-12 DBI_1.1.0
## [69] mgcv_1.8-31 pillar_1.4.3 haven_2.2.0 withr_2.1.2
## [73] modelr_0.1.6 crayon_1.3.4 rmarkdown_2.1 data.table_1.12.8
## [77] reprex_0.3.0 digest_0.6.25 xtable_1.8-4 httpuv_1.5.2
## [81] stats4_3.6.3 munsell_0.5.0 beeswarm_0.2.3 viridisLite_0.3.0
## [85] vipor_0.4.5