1 Introduction

1.1 Experimental setup

For the evaluation of the different blood collection tubes, blood was drawn from 3 healthy volunteers. The order of all blood tubes was randomized per donor. All blood tubes were filled to the volume recommended by the manufacturer.

Preservation tubes. We included 4 different preservation tubes (Streck cell-free RNA BCT, PAXgene Blood ccfDNA Tube, Roche cell-free DNA Collection tube and Biomatrica LBgard blood tube) with processing at 3 different timepoints after blood collection (t0, t24 and t72). Thus, 15 blood tubes were drawn per healthy volunteer.

Non-preservation plasma tubes. We included 1 non-preservation plasma tubes (BD Vacutainer K2E EDTA spray) with processing at 3 different timepoints after blood collection (t0, t24 and t72).

Thus, 15 blood tubes were drawn per healthy volunteer. Seven tubes were collected from one antecubital vein and 8 tubes were collected from the contralateral antecubital vein with the BD Vacutainer push button 21G, 7” tube with pre-attached holder tube butterfly needle system.

1.2 Metric selection

Several metrics were selected to assess tube effect and stability.

  1. Relative endogenous DNA content (see Relative DNA content)
  2. Genome-wide CpG methylation percentage (see Genome-wide CpG methylation)
  3. Absolute number of CpGs detected (unfiltered and filtered, see Number of CpGs)
  4. Immune content (NNLS and CelFiE, see Meth atlas and CelFiE/ENCODE as reference set)

Furthermore, in order to select the tubes which remains most stable across time points, we calculated for every tube and donor the fold change across different timepoints (excluding T24-T72). Thus, six fold-change values were obtained per tube. These were visualized and tubes were ordered based on the mean of these six values.

1.3 RMarkdown set-up

First, basic parameters are set up in this RMarkdown, such as loading dependencies, setting paths and setting up a uniform plot structure.

Pdf versions of most plots are in this github repository under ./plots/

knitr::opts_chunk$set(warning=FALSE, message=FALSE, cache = TRUE)
library(gridExtra)
require(tidyverse)
require(reshape2)
require(broom)
require(DT)
require(RColorBrewer)
require(ggpubr)
require(ggbeeswarm)
require(scales)
require(ggrepel)
require(zoo)
require(ggExtra)
require(ggplot2)
require(RCurl)
require(readxl)
require(plotly)
require(nnls)
require(httr)
require(psych)
require(ggbeeswarm)
require(irr)
require(asht)
require(DT)
require(readr)
source("HelperFunctions.R")

reScale <- function(x){(x-min(x))/(max(x)-min(x))}

remove_geom <- function(ggplot2_object, geom_type) {
  # Delete layers that match the requested type.
  layers <- lapply(ggplot2_object$layers, function(x) {
    if (class(x$geom)[1] == geom_type) {
      NULL
    } else {
      x
    }
  })
  # Delete the unwanted layers.
  layers <- layers[!sapply(layers, is.null)]
  ggplot2_object$layers <- layers
  ggplot2_object
}

# plot style
theme_point<-theme_bw()+theme(strip.background = element_blank())
theme_bar<-theme_bw()+theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),strip.background = element_blank(),axis.ticks.x = element_blank(),axis.line.x = element_blank())
theme_boxplot<-theme_bw()+theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),strip.background = element_blank(),axis.ticks.x = element_blank(),axis.title.x = element_blank(),axis.line.x = element_blank(),legend.position = "none")

color_panel<-c("#e35d6a","#5bb75b","#428bca","#e87810","#23496b","#ffbf00","#cc2028","#039748","pink","gray","darkgray")

color_panel1 <-  c("#039748","#039748","#ffbf00","#ffbf00","#e35d6a","#e35d6a","#5bb75b","#5bb75b","#428bca","#428bca","#23496b","#23496b","#cc2028","#cc2028","#e87810")

color_panel2 <-  brewer.pal(6, name = "Paired")
names(color_panel2) <- c("Biomatrica", "EDTA","RNA Streck","Roche","DNA Streck", "Paxgene")

full_nr <- format_format(big.mark = ",", decimal.mark = ".", scientific = FALSE)
options(scipen=10000)


# Global variables
data_path <- "../data/"
readcutoff <- 15

#bismark_meth <- read_tsv("../data/donor2/multiqc_bismark_methextract.txt")
#bismark_meth$`Sample` <- gsub("_.*", "", bismark_meth$`Sample`)

bismark_aligned <- read_tsv("../data/20200120_multiqc_data/multiqc_bismark_alignment.txt")
bismark_aligned$`Sample` <- gsub("_.*", "", bismark_aligned$`Sample`)
bismark_aligned <- bismark_aligned[!grepl("lambda", bismark_aligned$`Sample`),]
#bismark_aligned$`Category` <- gsub("_.*", "", bismark_aligned$`Category`)

bismark_aligned_lambda <- read_tsv("../data/20200120_multiqc_data/multiqc_bismark_alignment.txt")
bismark_aligned_lambda$`Sample` <- gsub("_.*", "", bismark_aligned_lambda$`Sample`)
bismark_aligned_lambda <- bismark_aligned_lambda[grepl("lambda", bismark_aligned_lambda$`Sample`),]
bismark_aligned_lambda$`Sample` <- gsub("lambda.", "", bismark_aligned_lambda$`Sample`)
#bismark_aligned$`Category` <- gsub("_.*", "", bismark_aligned$`Category`)
colnames(bismark_aligned_lambda) <- paste(colnames(bismark_aligned_lambda), sep = ".", "lambda")

bismark_aligned <- merge(bismark_aligned, bismark_aligned_lambda, by.x = "Sample", by.y = "Sample.lambda")

picard_hsmetrics <- read_tsv("../data/20200120_multiqc_data/multiqc_picard_HsMetrics.txt")#[-1,]
picard_hsmetrics$`Sample` <- gsub("_.*", "", picard_hsmetrics$`Sample`)
picard_hsmetrics$`Prct on bait` <- as.numeric(picard_hsmetrics$`ON_BAIT_BASES`)/as.numeric(picard_hsmetrics$`PF_UQ_BASES_ALIGNED`)

picard_dups <- read_tsv("../data/20200120_multiqc_data//multiqc_picard_dups.txt")#[-1,]
picard_dups$`Sample` <- gsub("_.*", "", picard_dups$`Sample`)
picard_dups <- picard_dups %>% filter(!grepl("^lambda", picard_dups$`Sample`))

sample_annotation<-read_excel("./annotation_RVP1808.xlsx")
sample_annotation <- merge(sample_annotation, bismark_aligned, by.x = "UniqueID", by.y = "Sample")
sample_annotation <- merge(sample_annotation, picard_hsmetrics, by.x = "UniqueID", by.y = "Sample")
sample_annotation <- merge(sample_annotation, picard_dups, by.x = "UniqueID", by.y = "Sample")
sample_annotation <- as_tibble(sample_annotation)
sample_annotation$pipeline <- "Bismark"
sample_annotation$Evolution <- paste0(sample_annotation$Tube, "_", sample_annotation$BiologicalReplicate, "_", sample_annotation$pipeline)
sample_annotation$MappedVsLambda <- (sample_annotation$aligned_reads + sample_annotation$ambig_reads )/(sample_annotation$aligned_reads.lambda) 
sample_annotation$MappingPrct = as.numeric(sample_annotation$`aligned_reads`)/(as.numeric(sample_annotation$`aligned_reads`)+ as.numeric(sample_annotation$`ambig_reads`)+as.numeric(sample_annotation$`discarded_reads`))*100

sample_annotation$Prct_on_bait = as.numeric(sample_annotation$`ON_BAIT_BASES`)/as.numeric(sample_annotation$`PF_BASES_ALIGNED`)*100

#describe(sample_annotation %>% mutate(Femto_pg =FEMTO_conc_nguL*1000) %>% select(Femto_pg, Tube), quant = c(.25,.75), na.rm = TRUE)
#sample_annotation$MappedVsLambda <- (sample_annotation$percent_aligned  )/(sample_annotation$percent_aligned.lambda) 

2 Femto PULSE

For all samples, we performed quality control with capillary electrophoresis, indicating the presence of a high content of HMW DNA in EDTA at T72.

ladder <- read_tsv("../data/femto_data/ladder.txt")
colnames(ladder) <- c("time_sec", "ladder" )
ladder$time_sec <- as.character(ladder$time_sec )
df_melt <- ladder %>% melt()
df_melt<- as_tibble(df_melt)
df_melt$time_sec <- as.numeric(df_melt$time_sec)
p <- ggplot(df_melt, aes(x = time_sec/60, color = variable, y = value)) +
  geom_line() + theme_point + 
  labs(title = "ladder", y = "RFU", x = "time (min)", color = "condition")

time_ladder <- c("946.63100", "1096.73000", "1230.82000", "1362.91000", "1491.99000", "1612.07000", "1712.14000", "1797.20000", "1878.25000", "1945.30000", "2001.33000", "2081.39000", "2158.44000", "2208.47000", "2264.51000", "2361.57000")
size_ladder <- c(1,100,200,300,400,500,600,700,800,900,1000,1200,1500,2000,3000,6000)

time_ladder <- c("946.63100", "1096.73000", "1230.82000", "1362.91000", "1491.99000", "1612.07000", "1797.20000", "2001.33000", "2158.44000", "2264.51000", "2361.57000")
size_ladder <- c(1,100,200,300,400,500,700,1000,1500,3000,6000)

ladder_df <- data.frame(time_sec = time_ladder, size_ladder = size_ladder)
ladder_df <- as_tibble(ladder_df)
ladder_df$time_sec <- as.character(ladder_df$time_sec)
ladder_df_melt <- ladder_df %>% melt()

makePlot <- function(input, tubetype, donor, ylim = c(-100,4000), offset_sec = 0){
  input <- input %>% select(-c(`Time (HH:MM:SS) - Plot 1`, `Time (HH:MM:SS) - Plot 2`))
  colnames(input) <- c("time_sec", paste0(donor, " - T0"), paste0(donor, " - T24"), paste0(donor, " - T72") )
  input$time_sec <- as.character(input$time_sec )
  df_melt <- input %>% melt()
  df_melt<- as_tibble(df_melt)
  df_melt$time_sec <- as.numeric(df_melt$time_sec)
  p <- ggplot(df_melt %>% filter(time_sec > 800), aes(x = time_sec/60 - offset_sec/60, color = variable, y = value)) +
            geom_line() + theme_point + 
            lims(y = ylim)+
            theme(axis.text.x = element_text(angle = 90, hjust = 1))+
            scale_x_continuous(breaks = as.numeric(time_ladder)/60, labels = size_ladder)+
            labs(title = paste0(tubetype, " - ", donor), y = "RFU", x = "length (bp)", color = "condition")
  p
}

3 Sample annotation

An overview of the samples included in this experiment.

# fold change function + make plots
generateFC <- function(input_df, variable, dfName, AC){
fold_change_input_all <- data.frame() 
for (duplicate_type in unique(sample_annotation$Tube)){
  sample_duplicates<-sample_annotation%>% filter(Tube==duplicate_type)
  if(nrow(sample_duplicates)>1){
    #print(duplicate_type)
    #double_positives_sample <-double_positives %>% dplyr::select(MIMATID,sample_duplicates$UniqueID) 
    input_df_sample <- left_join(sample_duplicates, input_df)
    # only keep the replicates of one type
   
    samples_comb <- combn(sample_duplicates$UniqueID,2) #compare 2 of the 3 samples at a time
    for (n_col in 1:ncol(samples_comb)) {
      #print(samples_comb[,n_col])
      nr_runA <- gsub("^[A-Z]+","",sample_annotation[sample_annotation$UniqueID== samples_comb[1,n_col],]$TimeLapse)
      nr_runB <- gsub("^[A-Z]+","",sample_annotation[sample_annotation$UniqueID== samples_comb[2,n_col],]$TimeLapse)
      RepA <- sample_annotation[sample_annotation$UniqueID== samples_comb[1,n_col],]$BiologicalReplicate
      RepB <- sample_annotation[sample_annotation$UniqueID== samples_comb[2,n_col],]$BiologicalReplicate
     if ((RepA == RepB)){
      varname <- paste0("T",nr_runA,"_",nr_runB) #make a name so you can backtrace which replicates are compared
      #print(varname)
      compareVar <- input_df %>% filter(UniqueID == paste(samples_comb[1,n_col]) | UniqueID == paste0(samples_comb[2,n_col])) 
      if (AC == TRUE){
      plot = "absolute change"
      intercept = 0
      if (is.na((abs(compareVar[[variable]][1]) > abs(compareVar[[variable]][2])))){
        correlation_samples<- compareVar %>%
        mutate(foldchange=NA)
      }
      else if (abs(compareVar[[variable]][1]) >= abs(compareVar[[variable]][2])){
        correlation_samples<- compareVar %>%
        mutate(foldchange= abs(compareVar[[variable]][1])-abs(compareVar[[variable]][2]))
      } else {
        correlation_samples<- compareVar %>%
        mutate(foldchange= abs(compareVar[[variable]][2])-abs(compareVar[[variable]][1]))
      }}
      else if (AC == FALSE){
      plot = "fold change"
      intercept = 1
      if (is.na((abs(compareVar[[variable]][1]) > abs(compareVar[[variable]][2])))){
        correlation_samples<- compareVar %>%
        mutate(foldchange=NA)
      }
      else if (abs(compareVar[[variable]][1]) >= abs(compareVar[[variable]][2])){
        correlation_samples<- compareVar %>%
        mutate(foldchange= abs(compareVar[[variable]][1])/abs(compareVar[[variable]][2]))
      } else {
        correlation_samples<- compareVar %>%
        mutate(foldchange= abs(compareVar[[variable]][2])/abs(compareVar[[variable]][1]))
      }}
      
      FC_input<-correlation_samples %>% 
        mutate(Tube=duplicate_type, Replicates=varname, BiologicalReplicate = paste0(RepA,"-",RepB)) %>%
        mutate(ReplID = paste0(Tube,"-",Replicates))
      FC_input <- FC_input %>% mutate(inputType = paste0(dfName))
      if ((FC_input$TimeLapse != c("T24", "T72"))){
      fold_change_input_all <- rbind(fold_change_input_all, as.data.frame(FC_input))}
      }
    }
  }
}
FC <- fold_change_input_all %>% dplyr::group_by(Tube,Replicates,BiologicalReplicate) %>%
  mutate(ReplID = paste0(Tube,"-",Replicates)) %>% select(c(ReplID, foldchange, Replicates, inputType)) %>% distinct(ReplID, .keep_all = TRUE)
  

print(ggplot(FC, aes(x=reorder(Tube,foldchange, FUN = function(x) -mean(x, na.rm = TRUE)), y = foldchange)) + geom_boxplot() + geom_beeswarm(groupOnX=TRUE, aes(col = Replicates))  + geom_hline(yintercept = intercept, linetype = "dashed", color = "gray") + labs(subtitle = "ordered by mean", title = paste0(plot, " of ", dfName), y = paste0(plot), x = "Tube", col = "Comparison") + scale_color_hue(labels = c("0h vs 24h", "0h vs 72h")) +
  scale_fill_manual(values=color_panel) + theme_point+
  stat_summary(
    geom = "point",
    fun.y = "mean",
    col = "black",
    size = 2,
    shape = 24,
    fill = "white"
  )) 
return(FC)
}

4 Sequencing and preprocessing QC

All samples were sequenced on a Novaseq6000 SP flow cell, paired-end, 2x50 cycles, in three runs. The order of the tubes was randomized across sequencing runs. Run metrics look nominal.

  1. NVQ_Run037
  • % ≥Q30: 94%
  • % clusters PF: 83%
  • Yield: 123.22 Gbp
  1. NVQ_Run047
  • % ≥Q30: 93.39%
  • % clusters PF: 82.30%
  • Yield: 119.84 Gbp
  1. NVQ_Run051
  • % ≥Q30: 93.99%
  • % clusters PF: 82.80%
  • Yield: 120.46 Gbp

4.1 Pipeline parameters and MultiQC reports

MultiQC reports of the pipeline can be found in this github repository under data/20200120_multiqc_report.html

The script of the pipeline to see the parameters can be found on ./code/preprocessing. Other supporting files and documentation on the meth atlas deconvolution can be found at https://github.com/rmvpaeme/cfRRBS_manuscript.

4.2 Number of reads in different stages of the pipeline

All reads were subsampled to 18M reads so that there is a fair comparison between all metrics (e.g. if one sample is sequenced deeper it will probably yield more CpGs compared to a sample that was sequenced less deep).

The reads were counted on the fastq level and plotted in the following plots.

4.3 Total reads after subsampling

4.5 Bisulfite conversion efficiency

Bisulfite conversion is assessed by evaluating the CHH methylation, which is normally unmethylated. Theoretically, 100 - CHH methylation % should be 100, as this equals an 100% conversion efficiency. In practice, this should be >99%. We can see from our data that bisulfite conversion was successful in every sample.

4.6 Mapping efficiency

Mapping efficiency is as expected (between 60-70%). In EDTA tubes, the mapping efficiency increases with time, presumably due to WBC degradation.

4.7 Percentage on bait

Over 80% of the reads map within the MspI regions, this is not timepoint or tube dependent.

5 Tube content and overall CpG methylation

5.1 Absolute DNA content

The absolute DNA content was measured with the FEMTO Pulse for all samples. Two microliter of the eluate was used.

5.2 Relative DNA content

  • If we estimate the relative DNA content by looking at the mapped reads vs lambda spike-in (which are added before library preparation), we can see that DNA content increases at T72 in EDTA, while decreasing in the preservation tubes (which we also saw on the Femto).
  • Overall, preservation tubes also seem to contain less DNA.
EndoVsLambda <- function(Donor, Comparison){
  DNAcontent <- sample_annotation %>% filter(BiologicalReplicate == Donor)
  test <- log_rescaling_CI(DNAcontent, measurevar="MappedVsLambda", groupvar=c("SampleName"))
  
  rescale <- test %>% filter(SampleName == Comparison) %>% dplyr::select(measurevar_log_resc)
  rescale <- unlist(rescale)
  
  
  p1 <- ggplot(test,aes(x=TimeLapse,y=(10^measurevar_log_resc/10^rescale), color = Tube, group = Evolution))+
    geom_point()+geom_line()+
    theme_point +
    labs(y = "Relative endogenous DNA concentration", title="DNA concentration", subtitle = paste0("Endogenous DNA vs Lamda (rescaled vs ", Comparison, ")"))+  
    scale_y_log10(limits = c(0.1,20)) +
    #ylim(0.1,15)+
    geom_hline(yintercept = 1, linetype="dashed",colour="grey88") +
    scale_color_manual(values=color_panel)
  
  p1
}

#plot_data <- bind_rows(read_csv("../bwa_data/MappedVsLambda_bwa.csv"), sample_annotation)
sample_annotation$MappedVsLambda_pg_equivalent <- (sample_annotation$MappedVsLambda*5)/11.1
sample_annotation$MappedVsLambda_ng_equivalent <- (sample_annotation$MappedVsLambda*0.005)/11.1

sample_annotation$ngDNA_per_mL_plasma_lambda <- sample_annotation$MappedVsLambda_ng_equivalent*75/(as.numeric(sample_annotation$PlasmaInputV)/1000)

p_endoDNA <- ggplot(sample_annotation,aes(x=TimeLapse,y=(ngDNA_per_mL_plasma_lambda), col=BiologicalReplicate, group = Evolution))+
  geom_point(size = 2)+geom_line()+
  theme_point+
  facet_wrap(~Tube, ncol = 5, scales="free_x") +
  theme(panel.grid.major.x=element_line(linetype = "dashed",color="gray88"), axis.text.x=element_text(size=8)) +
  scale_color_manual(values=color_panel) +
  theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) + labs(title="Relative endogenous DNA content", y = "cfDNA ng/mL plasma (lambda eq.)")

p_endoDNA

6 Reading in CpG methylation calls

The next block imports all coverage files and merges them into a single dataframe. The coverage files from this manuscript can be found at ArrayExpress identifier ####.

6.1 PCA plots

  • Most samples seem to cluster close together, except for EDTA T72 (as expected on the FEMTO profile) and except for Roche T24 (not expected on the FEMTO profile).
  • No other subclusters are present based on tube or timepoint

6.2 Number of CpGs

nCpGs_filtered <- as.data.frame(sapply( cov_df_DNA_ID, function(f) sum(complete.cases(f))))
colnames(nCpGs_filtered) <- c("number_of_cpgs_filtered")
nCpGs_filtered <- as_tibble(cbind(UniqueID = rownames(nCpGs_filtered), nCpGs_filtered))
nCpGs_filtered <- merge(nCpGs_filtered, sample_annotation)

# Unfiltered data
nCpGs_unfiltered <- data.frame("UniqueID" = character(), "number_of_cpgs_unfiltered" = numeric())

files<-list.files(data_path,recursive=TRUE)
files<-files[grep("18.cov$", files)]
for(i in 1:length(files)){
  #tmp<-read.table(paste0(data_path,files[i]), header=T, sep="\t")
  tmp<-data.table::fread(paste0(data_path,files[i]), header=F, sep="\t", data.table=FALSE)
  colnames(tmp) <- c("chr","start","stop","beta","meth","unmeth")
  percent_cpg_meth_unfiltered <- mean(tmp$beta)
  percent_cpg_meth_filtered <- tmp %>% filter(meth+unmeth >= readcutoff) %>% summarise(mean = mean(beta)) %>% pull(mean)
  #name_sample<-gsub(".*/","",sub("*_S[0-9]{1,2}_R1_subsamp10M.cov","",sub("*_S[0-9]{1,2}_R1_subsamp10M.cov","",files[i])))
  name_sample<-gsub("_.*", "", files[i])
  nCpGs <- nrow(tmp)
  nCpGs <- data.frame("number_of_cpgs_unfiltered" = nCpGs, "UniqueID" = name_sample, "percent_cpg_meth_unfiltered" = percent_cpg_meth_unfiltered, "percent_cpg_meth_filtered" = percent_cpg_meth_filtered)
  nCpGs_unfiltered <- rbind(nCpGs, nCpGs_unfiltered)  
  #cov_df_unfiltered<-full_join(cov_df_unfiltered,tmp1,by=c("CpG_loc"))
}

nCpGs <- merge(nCpGs_filtered, nCpGs_unfiltered)
nCpGs$number_of_cpgs_filtered <- as.numeric(nCpGs$number_of_cpgs_filtered)

6.2.1 Filtered

6.2.2 Unfiltered

6.3 Genome-wide CpG methylation

6.3.1 Unfiltered

7 Reproducibility with ALC

7.1 Interpretation

The ALC is a measure of reproducibility per CpG.

  • If there are two replicates, the reproducibility reaches a maximum if the difference between two measurements is 0. If it’s not zero, it means that the reproducibility is lower.
  • If we calculate this for all detected CpGs between two replicates and make a cumulative plot, then the area between the curve and x = 0 (the area left of the curve) gives an indication of reproducibility.

7.2 Individual ALC plots

cov_df_DNA_ID <- cov_df_DNA_ID %>% drop_na()
p = list()
ALC_input_all <- data.frame() 
for (duplicate_type in unique(sample_annotation$Tube)){
  sample_duplicates<-sample_annotation%>% filter(Tube==duplicate_type) %>%
    dplyr::select(UniqueID,TimeLapse,Tube, BiologicalReplicate)
  if(nrow(sample_duplicates)>1){
    #print(duplicate_type)
   # double_positives_sample<-double_positives %>% dplyr::select(ensembl_gene_id,sample_duplicates$UniqueID) # only keep the replicates of one type
   
    samples_comb <- combn(sample_duplicates$UniqueID,2) #compare 2 of the 3 samples at a time
    for (n_col in 1:ncol(samples_comb)) {
      #print(samples_comb[,n_col])
      nr_runA <- gsub("^[A-Z]+","",sample_annotation[sample_annotation$UniqueID== samples_comb[1,n_col],]$TimeLapse)
      nr_runB <- gsub("^[A-Z]+","",sample_annotation[sample_annotation$UniqueID== samples_comb[2,n_col],]$TimeLapse)
      RepA <- sample_annotation[sample_annotation$UniqueID== samples_comb[1,n_col],]$BiologicalReplicate
      RepB <- sample_annotation[sample_annotation$UniqueID== samples_comb[2,n_col],]$BiologicalReplicate
     if ((RepA == RepB)){
      varname <- paste0("Rep",nr_runA,"_",nr_runB) #make a name so you can backtrace which replicates are compared
      #print(varname)
      correlation_samples<-cov_df_DNA_ID  %>%
        mutate(ratio=abs(get(samples_comb[1,n_col])-get(samples_comb[2,n_col]))) %>%
        dplyr::select(ratio,CpG_loc) #%>% drop_na()
      ALC_input<-correlation_samples %>% arrange(ratio) %>% # order by log2 ratio and then make a rank (counter) and rescale this to 1 (perc_counter)
        #mutate(rank=percent_rank(ratio)) %>% # this does not work: gives everything with log2ratio = 0 rank 0
        mutate(counter = seq(1:nrow(cov_df_DNA_ID)), Tube=duplicate_type, Replicates=varname, BiologicalReplicate = paste0(RepA,"-",RepB)) %>%
        mutate(ReplID = paste0(Tube,"-",Replicates), perc_counter = counter/nrow(cov_df_DNA_ID))
     
      
     # if (!(c("Rep24_72", "Rep72_24") %in% ALC_input_all$Replicates)){
      ALC_input_all <- rbind(ALC_input_all, ALC_input)
     # }
      }
    }
  }
}
ALC_input_all <- ALC_input_all %>% filter(!(Replicates %in% c("Rep24_72", "Rep72_24")))
max_ALC <-max(ALC_input_all$ratio) # calculate the max ALC over everything (necessary for area calculation -> should always be the same in order to compare among kits)
ALC <- ALC_input_all %>% dplyr::group_by(Tube,Replicates,BiologicalReplicate) %>%
  dplyr::summarise(ALC_calc = sum(ratio)/(max_ALC)) %>%
  #dplyr::summarise(ALC_calc = sum(ratio)/(max_ALC)) %>%
  mutate(ReplID = paste0(Tube,"-",Replicates))
ALC$ALC_scaled <- (ALC$ALC_calc - min(ALC$ALC_calc))/diff(range(ALC$ALC_calc))
ALC_df <- NULL

for (replicates in unique(ALC_input_all$ReplID)) { 
  for (techrep in c("DONOR1-DONOR1", "DONOR2-DONOR2", "DONOR3-DONOR3")){
   # replicates = "DNA Streck-Rep24_0"     
  #  techrep = "TUBE1-TUBE1" # plot the ALC (= the colored part of the plot)
    tmp <- ALC_input_all %>% dplyr::filter(ReplID==replicates)  %>%
                 dplyr::filter(BiologicalReplicate == techrep)
    
    y <- ALC_input_all %>% dplyr::filter(ReplID==replicates)  %>%
                 dplyr::filter(BiologicalReplicate == techrep) %>% pull(ratio)
    y <- y/max_ALC
    x <- ALC_input_all %>% dplyr::filter(ReplID==replicates)  %>%
                 dplyr::filter(BiologicalReplicate == techrep) %>% pull(perc_counter)
    id <- order(x)
    
    AUC <- sum(diff(x[id])*rollmean(y[id],2))
   # ALC_value <- 1-AUC
    tmp_1 <- data.frame(UniqueID = paste0(techrep, "_", replicates), ALC_rescaled = AUC)
    ALC_df <- rbind(tmp_1, ALC_df)
    
  if (nrow(tmp) != 0){ 
  plot_ID <- paste0(replicates, " - ", techrep)
  tube_id <- strsplit(replicates, "-")[[1]][1]
  time_id <- strsplit(replicates, "-")[[1]][2]
  time_id <- sub("Rep", "", time_id)
  time_id <- paste0(sub("_", "h vs ", time_id), "h")
  donor_id <- strsplit(techrep, "-")[[1]][1]
  donor_id <- sub("DONOR", "D",donor_id)
  p[[plot_ID]] <- ggplot( tmp %>%
          mutate(ratio_resc = ratio/max_ALC))+
        geom_line(aes(x=ratio_resc,y=perc_counter))+
        #facet_wrap(~ReplID) +
        geom_ribbon(aes(x=ratio_resc,ymin=perc_counter,ymax=1), fill="lightblue")+
        geom_hline(aes(yintercept = 1))+
        theme_classic()+
        scale_x_continuous(limits=c(0,1)) +
        scale_y_continuous(expand = c(0, 0)) +
        theme(legend.position = "none") +
        theme(axis.text.x = element_text(angle = 90, hjust = 1), plot.title = element_text(size = 14), plot.subtitle = element_text(size = 14)) +
        labs(title=paste0(tube_id, " ", donor_id, "\n", time_id),
             #subtitle=paste("ALC:", round(ALC %>% ungroup() %>% dplyr::filter(ReplID==replicates)  %>%
             #    dplyr::filter(BiologicalReplicate == techrep) %>% select(ALC_calc),2)), #print ALC for this particular comparison!
             subtitle=paste("ALC:", round(AUC,4)), #print ALC for this particular comparison!
             y=" ",x=" ")
    }
  }
}
ALC$UniqueID <- paste0(ALC$BiologicalReplicate, "_", ALC$ReplID)
ALC <- merge(ALC, ALC_df)

figure <- ggarrange(plotlist = p, common.legend = TRUE)
annotate_figure(figure, left = "rank", bottom = "rescaled difference")

8 Meth atlas

In order to estimate immune content, deconvolution with meth atlas was done in a similar fashion as in a previous preprint (https://www.biorxiv.org/content/10.1101/795047v1). More information is available at https://github.com/rmvpaeme/cfRRBS_manuscript.

The deconvolution script was edited to select the top 100 hyper and hypomethylated regions.

## [1] 20.53333

## [1] 19.53333
## [1] 19.53333

9 CelFiE

9.1 Meth atlas as reference set

CelFiE was run with the following parameters:

data = "../data/deconvolution_results/input_for_celfie/methatlas_sample_reference_tims.txt"
num_samples = 45
iterations = 1000
num_unk = 1
iteration_number = 1
convergence_criteria = 0.001
num_random_restart = 1

The sample and reference order for the methatlas_sample_reference_tims.txt can be found at ../data/deconvolution_results/input_for_celfie/methatlas_reference_file_key.txt and ../data/deconvolution_results/input_for_celfie/methatlas_sample_key.txt

MAIN CONCLUSION CelFiE and the meth-atlas reference set are not compatible.

9.2 CelFiE/ENCODE as reference set

CelFiE was run with the following parameters:

data = "../data/deconvolution_results/input_for_celfie/rrbs_sample_reference_tims.txt
num_samples = 45
iterations = 1000
num_unk = 1
iteration_number = 1
convergence_criteria = 0.001
num_random_restart = 1

The sample and reference order for the rrbs_sample_reference_tims.txt can be found at ../data/deconvolution_results/input_for_celfie/celfie_reference_file_key.txt and ../data/deconvolution_results/input_for_celfie/celfie_sample_key.txt

12 Change summary

13 Session info

## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] gdtools_0.2.1      pheatmap_1.0.12    plyr_1.8.6         asht_0.9.4        
##  [5] coin_1.3-1         bpcp_1.4           survival_3.1-8     exact2x2_1.6.3.1  
##  [9] exactci_1.3-3      ssanv_1.1          irr_0.84.1         lpSolve_5.6.15    
## [13] psych_1.9.12.31    httr_1.4.1         nnls_1.4           plotly_4.9.2.1    
## [17] readxl_1.3.1       RCurl_1.98-1.1     ggExtra_0.9        zoo_1.8-7         
## [21] ggrepel_0.8.2      scales_1.1.0       ggbeeswarm_0.6.0   ggpubr_0.2.5      
## [25] magrittr_1.5       RColorBrewer_1.1-2 DT_0.13            broom_0.5.5       
## [29] reshape2_1.4.3     forcats_0.5.0      stringr_1.4.0      dplyr_0.8.5       
## [33] purrr_0.3.3        readr_1.3.1        tidyr_1.0.2        tibble_3.0.0      
## [37] ggplot2_3.3.0      tidyverse_1.3.0    gridExtra_2.3     
## 
## loaded via a namespace (and not attached):
##  [1] TH.data_1.0-10     colorspace_1.4-1   ggsignif_0.6.0     ellipsis_0.3.0    
##  [5] modeltools_0.2-23  fs_1.4.0           rstudioapi_0.11    farver_2.0.3      
##  [9] fansi_0.4.1        mvtnorm_1.1-0      lubridate_1.7.4    xml2_1.3.0        
## [13] codetools_0.2-16   splines_3.6.3      mnormt_1.5-6       libcoin_1.0-5     
## [17] knitr_1.28         jsonlite_1.6.1     dbplyr_1.4.2       shiny_1.4.0.2     
## [21] compiler_3.6.3     backports_1.1.5    assertthat_0.2.1   Matrix_1.2-18     
## [25] fastmap_1.0.1      lazyeval_0.2.2     cli_2.0.2          later_1.0.0       
## [29] htmltools_0.4.0    tools_3.6.3        perm_1.0-0.0       gtable_0.3.0      
## [33] glue_1.3.2         Rcpp_1.0.4         cellranger_1.1.0   vctrs_0.2.4       
## [37] svglite_1.2.3      nlme_3.1-144       crosstalk_1.1.0.1  xfun_0.12         
## [41] rvest_0.3.5        mime_0.9           miniUI_0.1.1.1     lifecycle_0.2.0   
## [45] MASS_7.3-51.5      hms_0.5.3          promises_1.1.0     parallel_3.6.3    
## [49] sandwich_2.5-1     yaml_2.2.1         stringi_1.4.6      systemfonts_0.1.1 
## [53] rlang_0.4.5        pkgconfig_2.0.3    bitops_1.0-6       matrixStats_0.56.0
## [57] evaluate_0.14      lattice_0.20-38    labeling_0.3       htmlwidgets_1.5.1 
## [61] cowplot_1.0.0      tidyselect_1.0.0   R6_2.4.1           generics_0.0.2    
## [65] multcomp_1.4-12    DBI_1.1.0          pillar_1.4.3       haven_2.2.0       
## [69] withr_2.1.2        modelr_0.1.6       crayon_1.3.4       rmarkdown_2.1     
## [73] grid_3.6.3         data.table_1.12.8  reprex_0.3.0       digest_0.6.25     
## [77] xtable_1.8-4       httpuv_1.5.2       stats4_3.6.3       munsell_0.5.0     
## [81] beeswarm_0.2.3     viridisLite_0.3.0  vipor_0.4.5