Mapping genetic effects of diverse mESC lines on cellular phenotypes with Census-seq

Author

Maddie Rea

Published

August 5, 2024

Welcome to the online companion to Madeline Rea’s 2024 Jackson Lab Summer Student Program (SSP) poster! As a note, if you are viewing this on a phone, the webpage renders best in landscape mode (long-ways/hot dog). Press the “Code” drop-downs to see exactly how each graph here was created.

Code
library(ggplot2)   # plotting
library(MASS)
library(reshape2)  # melt() for wide-to-long reshaping
library(reshape)
library(patchwork) # stacking ggplots
library(ggpubr)
library(plotly)    # interactive and animated plots
library(dplyr)
library(readr)     # read_tsv()
library(crayon)
library(pheatmap)  # clustered heatmaps

Proportion versus Percent Error

Sixteen samples were randomly chosen from the pool of forty-eight and assigned equal, logarithmic, and “random” proportions. Each bar represents the total reads a sample contributed to the pool of three million reads, with magenta marking reads that failed quality control. Above that, the difference between each VCF’s estimated and true proportion is graphed, along with the corresponding percent error. All samples are ordered by their read counts in the logarithmic pool (hence the logarithmic panel’s perfect ordering).

Code
compare_vcfs = function(quilt_input, geno_input, census_input, x_order, base = FALSE){
  
  # Strip the plexWell prefix and GT23 run suffix so donor IDs match across files
  census_input$V1 = sub("plexWell-", "", census_input$V1)
  census_input$V1 = sub("_GT23-023.._.*", "", census_input$V1)
  quilt_input$DONOR = sub("plexWell-", "", quilt_input$DONOR)
  quilt_input$DONOR = sub("_GT23-023.._.*", "", quilt_input$DONOR)
  geno_input$DONOR = sub("plexWell-", "", geno_input$DONOR)
  geno_input$DONOR = sub("_GT23-023.._.*", "", geno_input$DONOR)
  
  # Attach total submitted reads (census report column V2) to the Census-seq
  # output, then reshape to long format: one row per donor per QC status
  census_input = census_input[order(census_input$V1),]
  quilt_input$READS = census_input$V2
  meltedbar = subset(quilt_input, select = c(DONOR,READS,COUNT))
  colnames(meltedbar) <- c('DONOR','Failed','Passed')
  meltedbar = melt(meltedbar,id="DONOR")
  colnames(meltedbar) <- c('Donor','Filtered','Reads')
  
  # position = "identity" overlays the bars: the total-reads bar (magenta) sits
  # behind the passed-reads bar (green), so the visible magenta is failed reads
  p1 <- ggplot(data=meltedbar, aes(x=Donor,y=Reads, fill=Filtered)) +
  geom_bar(position="identity",stat="identity") +
  scale_fill_manual(values=c("#D41159", "#69C561")) +
  scale_x_discrete(limits = x_order) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
    panel.background = element_rect(fill='transparent'), 
    plot.background = element_rect(fill='transparent', color=NA), 
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(), 
    legend.background = element_rect(fill='transparent'), 
    legend.box.background = element_rect(fill='transparent') 
  ) + 
  labs(y = "Total Reads") +
  guides(fill=guide_legend(title="Reads QC"))
  
  # Only the bottom panel of the stack (base = TRUE) keeps its donor labels
  if (!base){
    p1 <- p1 + theme(axis.text.x=element_blank(),
                     axis.ticks.x=element_blank(),
                     axis.title.x =element_blank()) 
  }
    
  
  # Δ proportion: estimated minus true contribution, for each VCF
  quilt_input$G_REP = geno_input$REPRESENTATION
  quilt_input$Genoprobs = quilt_input$G_REP - quilt_input$KNOWN
  quilt_input$Quilt = quilt_input$REPRESENTATION - quilt_input$KNOWN
  meltedline = subset(quilt_input, select = c(DONOR,Quilt,Genoprobs))
  meltedline = melt(meltedline,id="DONOR")
  colnames(meltedline) <- c('Donor','VCF','Differences')
  p2 <- ggplot(data = meltedline) +
  geom_hline(yintercept=0,linewidth=0.5)+
  geom_line(aes(x = Donor, y = Differences, colour = VCF, group = VCF)) +
  geom_point(aes(x = Donor, y = Differences, colour = VCF, group = VCF))+
  scale_x_discrete(limits = x_order) +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  scale_color_manual(values=c('#063970', '#A69943')) +
  theme(
    panel.background = element_rect(fill='transparent'), 
    plot.background = element_rect(fill='transparent', color=NA), 
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(), 
    legend.background = element_rect(fill='transparent'), 
    legend.box.background = element_rect(fill='transparent') 
  ) + labs(y = "∆ Proportion")
  
  # Percent error: the same differences, now relative to the true proportion
  quilt_input$Genoprobs = ((quilt_input$G_REP - quilt_input$KNOWN) / quilt_input$KNOWN) * 100
  quilt_input$Quilt = ((quilt_input$REPRESENTATION - quilt_input$KNOWN) / quilt_input$KNOWN) * 100
  meltedline_error = subset(quilt_input, select = c(DONOR,Quilt,Genoprobs))
  meltedline_error = melt(meltedline_error,id="DONOR")
  colnames(meltedline_error) <- c('Donor','VCF','PercentError')
  p3 <- ggplot(data = meltedline_error) +
  geom_hline(yintercept=0,linewidth=0.5)+
  geom_line(aes(x = Donor, y = PercentError, colour = VCF, group = VCF)) +
  geom_point(aes(x = Donor, y = PercentError, colour = VCF, group = VCF))+
  scale_x_discrete(limits = x_order) +
  scale_color_manual(values=c('#063970', '#A69943')) +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  theme(
    panel.background = element_rect(fill='transparent'), 
    plot.background = element_rect(fill='transparent', color=NA), 
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    legend.background = element_rect(fill='transparent'), 
    legend.box.background = element_rect(fill='transparent')
  ) + labs(y = "Percent Error")

  stacked_plots <- p2 + p3 + p1 + plot_layout(ncol = 1)
  stacked_plots
}
Code
log_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_census_report.tsv",header=FALSE,sep="\t")

log_census$V1 = sub("plexWell-", "", log_census$V1)
log_census$V1 = sub("_GT23-023.._.*", "", log_census$V1)

# Donor order by assigned read count, reused as the x-axis order for all three pools
hold = log_census$V1[order(log_census$V2)]

Equal Proportions

Code
equal_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_quilt.census.txt",header=TRUE,sep="\t",skip=2)
equal_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_geno.census.txt",header=TRUE,sep="\t",skip=2)
equal_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_census_report.tsv",header=FALSE,sep="\t")

compare_vcfs(equal_quilt,equal_geno,equal_census,hold,base=TRUE)

While the Δ Proportion and Percent Error graphs are nearly identical here (because of the common denominator in the percent error [PE] calculation), this is not the case for the logarithmic and “random” pools. The Genoprobs VCF notably overestimates sample M35’s contribution to the total pool, but otherwise performs comparably to the Quilt VCF.

Logarithmic Proportions

Code
log_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_quilt.census.txt",header=TRUE,sep="\t",skip=2)
log_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_geno.census.txt",header=TRUE,sep="\t",skip=2)
log_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_census_report.tsv",header=FALSE,sep="\t")

compare_vcfs(log_quilt,log_geno,log_census,hold,base=TRUE)

Here, even though both VCFs underestimate the contribution of sample M14, the percent error on this estimate is very low because the denominator of the PE calculation is so large. Conversely, although Census-seq estimated samples M04 and M38 reasonably closely in absolute terms, the PE on those estimates is huge because the denominator is very small. Again, the Genoprobs VCF overestimates the contribution of sample M35.
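
To make the denominator effect concrete, here is a tiny example with made-up numbers (not from any actual run): two donors are off by the same absolute amount, yet their percent errors differ by almost two orders of magnitude.

Code
# Hypothetical values for illustration only
known    <- c(0.30, 0.005)  # true proportions: one large, one tiny
estimate <- c(0.29, 0.015)  # both estimates are off by 0.01 in absolute terms
data.frame(known, estimate,
           delta = estimate - known,
           percent_error = (estimate - known) / known * 100)
# delta is -0.01 and +0.01, but percent_error is roughly -3.3 versus 200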

“Random” Proportions

Code
random_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_quilt.census.txt",header=TRUE,sep="\t",skip=2)
random_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_geno.census.txt",header=TRUE,sep="\t",skip=2)
random_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_census_report.tsv",header=FALSE,sep="\t")
compare_vcfs(random_quilt,random_geno,random_census,hold,base=TRUE)

The reason “random” is in quotes throughout the poster and this webpage is that truly random proportions would not guarantee every sample receives reads. The Python script takes a total read count X (in this case three million) and ensures that each sample gets at least 0.5% of the total reads in the pool. It also caps any one sample at 70% of the entire pool, so the proportions are not truly random; a minimal R sketch of these constraints follows below. Census-seq does not always overestimate samples with high proportions (like M14 in the logarithmic example), as seen with sample F19 here.
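
The actual proportioning script is written in Python; the sketch below is my minimal R translation of the two constraints described above (at least 0.5% and at most 70% per sample), using rejection sampling as an assumed strategy rather than the script’s actual algorithm.

Code
# Minimal sketch, not the actual Python script: redraw proportions until both
# constraints hold, then convert proportions to read counts
assign_random_reads <- function(n_samples = 16, total_reads = 3e6,
                                min_prop = 0.005, max_prop = 0.70) {
  repeat {
    props <- runif(n_samples)
    props <- props / sum(props)  # normalize so the proportions sum to one
    if (all(props >= min_prop) && all(props <= max_prop)) break
  }
  round(props * total_reads)     # rounding can shift the total by a few reads
}

set.seed(1)
reads <- assign_random_reads()
range(reads / sum(reads))        # every sample between 0.5% and 70% of the pool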

Sequencing Depth Affects Error Rate

Code
# Accumulators: one entry per Census-seq run
sample_amounts <- c()
read_numbers <- c()
percent_errors <- c()
proportion_types <- c()

# Mean absolute percent error across all donors in one run
calculate_pe <- function(df) {
  df$PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
  av_percent_error <- mean(df$PERCENT_ERROR)
  return(av_percent_error)
}

files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)

for (file_path in files) {
  
  # Run metadata is encoded in the directory path:
  # .../<sample amount>/<proportion type>/<read number>/<run>/my.census.txt
  path_parts <- strsplit(file_path, "/")[[1]]
  sample_amount <- path_parts[length(path_parts) - 4]
  proportion_type <- path_parts[length(path_parts) - 3]
  read_number <- path_parts[length(path_parts) - 2]
  
  df <- read_tsv(file_path, skip = 2)
  percent_error <- calculate_pe(df)
  sample_amounts <- c(sample_amounts, sample_amount)
  read_numbers <- c(read_numbers, read_number)
  percent_errors <- c(percent_errors, percent_error)
  proportion_types <- c(proportion_types, proportion_type)
}

percent_errors <- as.numeric(percent_errors)

# Map directory tokens to actual read counts
reads_to_numbers <- list(
  'h_thousand' = 100000,
  'o_million' = 1000000,
  't_million' = 10000000,
  'f_million' = 50000000,
  'h_million' = 100000000
)

read_numbers <- sapply(read_numbers, function(x) reads_to_numbers[[x]])

samples_to_numbers <- list(
  's4' = 4,
  's8' = 8,
  's16' = 16,
  's32' = 32,
  's48' = 48
)

sample_amounts <- sapply(sample_amounts, function(x) samples_to_numbers[[x]])

df <- data.frame(
  'Reads' = read_numbers,
  'Samples' = sample_amounts,
  'Percent_Error' = percent_errors,
  'Proportion_Type' = proportion_types
)

fig <- plot_ly(df, x = ~Samples, y = ~Reads, z = ~Percent_Error, color = ~Proportion_Type, colors = c('#009E73','#56B4E9','#F0E442'))
fig <- fig %>% add_markers(marker = list(opacity = 0.5))
fig <- fig %>% layout(scene = list(xaxis = list(title = 'Number of Samples'),
                     yaxis = list(title = 'Number of Reads'),
                     zaxis = list(title = 'Percent Error')))

fig

In this three-dimensional plot, the number of reads in a pool is plotted against the number of samples in the pool and against the average percent error for each run. Logarithmic proportions tend to have the highest percent error due to the large number of samples with very small proportions. Use the controls on the top right of the graph to rotate and zoom as needed!

Animated Graphs

These graphs are not on the poster, since a static poster cannot show animation. However, I think they illustrate that the PE of samples with very small proportions decreases as the number of reads in the pool increases.

Random Proportion versus Percent Error Animated

Code

files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/s48/random/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)

# Start empty; rbind() with a zero-column data frame just returns the other
# argument, whereas a typed empty frame with mismatched column names errors
results <- data.frame()

for (file_path in files) {
  
  path_parts <- strsplit(file_path, "/")[[1]]
  read_number <- path_parts[length(path_parts) - 2]
  run_number <- path_parts[length(path_parts) - 1]
  
  df <- read_tsv(file_path, skip = 2)
  
  df <- df %>%
    mutate(DONOR = paste0(DONOR, "_", run_number),  # keep repeated runs distinct
           Percent_Error = abs(REPRESENTATION - KNOWN) / KNOWN * 100,
           Reads = read_number)
  
  results <- rbind(results, df)
  
}


reads_to_numbers <- list(
  'h_thousand' = 100000,
  'o_million' = 1000000,
  't_million' = 10000000,
  'f_million' = 50000000,
  'h_million' = 100000000
)

results$Reads <- sapply(results$Reads, function(x) reads_to_numbers[[x]])
results$Reads <- as.integer(results$Reads)

results$DONOR <- as.factor(results$DONOR)

fig <- results %>%
  plot_ly(
    x = ~KNOWN,
    y = ~Percent_Error,
    frame = ~Reads,
    text = ~DONOR,
    hoverinfo = "text",
    type = 'scatter',
    mode = 'markers'
  ) %>%
  layout(
    xaxis = list(title = "Proportion in Pool"),
    yaxis = list(title = "Percent Error"))

fig

I start with “random” proportions here because I think they are most representative of an actual cell village’s distribution. While some samples don’t have many reads overall, they never fall below 0.5% of the total pool; meanwhile, some samples take up almost 50% of the pool. Pay special attention to samples with very small proportions: you can watch their PE fall as reads increase.

Logarithmic Proportion versus Percent Error Animated

Code

files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/s48/log/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)

results <- data.frame()  # empty start avoids rbind() column-name mismatches

for (file_path in files) {
  
  path_parts <- strsplit(file_path, "/")[[1]]
  read_number <- path_parts[length(path_parts) - 2]
  run_number <- path_parts[length(path_parts) - 1]
  
  df <- read_tsv(file_path, skip = 2)
  
  df <- df %>%
    mutate(DONOR = paste0(DONOR, "_", run_number),
           Percent_Error = abs(REPRESENTATION - KNOWN) / KNOWN * 100,
           Reads = read_number)
  
  results <- rbind(results, df)
  
}


reads_to_numbers <- list(
  'h_thousand' = 100000,
  'o_million' = 1000000,
  't_million' = 10000000,
  'f_million' = 50000000,
  'h_million' = 100000000
)

results$Reads <- sapply(results$Reads, function(x) reads_to_numbers[[x]])
results$Reads <- as.integer(results$Reads)

results$DONOR <- as.factor(results$DONOR)

fig <- results %>%
  plot_ly(
    x = ~KNOWN,
    y = ~Percent_Error,
    frame = ~Reads,
    text = ~DONOR,
    hoverinfo = "text",
    type = 'scatter',
    mode = 'markers'
  ) %>%
  layout(
    xaxis = list(title = "Proportion in Pool"),
    yaxis = list(title = "Percent Error")
  )
fig

The y-axis of this graph stretches over eight times as far as in the “random” graph. This is because, unlike the “random” distributions, the logarithmic proportions have no minimum 0.5% rule, so samples can have PE upwards of 2500%. This raises the question: at what proportion can Census-seq accurately estimate contribution?

In Silico Proportions versus Inferred Proportions

To answer that question, we tested very small proportions (0.0005-0.012) with differing numbers of reads. The results are very similar to those in the original paper (Mapping genetic effects on cellular phenotypes with “cell villages” by Jana M. Mitchell et al.). All of the following were run in pools of 48 samples.

One Million Reads In Silico versus Inferred

Code
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_one", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)

merged_dfs <- list()

for (dir in dirs) {
  # Construct the file paths
  file_path_txt <- file.path(dir, "my.census.txt")
  file_path_tsv <- file.path(dir, "census_report.tsv")
  df_txt <- read_tsv(file_path_txt, skip = 2)
  df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
  merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
  # X2 is submitted reads per donor; dividing by pool size gives the in silico fraction
  merged_df$X2 = merged_df$X2 / 1000000
  
  merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}

final_df <- do.call(rbind, merged_dfs)
# Keep only the small-proportion donors. Indexing with which() on the kept rows
# is safer than -which() on the dropped rows, which returns zero rows if nothing matches
final_df <- final_df[which(final_df$X2 <= 0.1),]


p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) + 
  geom_point(shape=1) + 
  geom_abline(slope=1, intercept=0,colour='#E41A1C') +
  scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
  scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
  theme_minimal()


final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100

final_df_one = final_df %>%
  group_by(X2) %>%
  mutate(Median_Percent_Error = median(Percent_Error))

p2 <- ggplot(final_df_one, aes(x = X2, y = Median_Percent_Error)) + 
  geom_point() + 
  geom_line() + 
  scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
  scale_y_continuous(name="Median Percent Error", limits=c(0, 150)) +
  theme_minimal()

p1 + p2

These proportions are based on the reads submitted to Census-seq, so this graph does not account for the reads that failed quality control. That graph is below.
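
To spell out the two denominators being compared (with made-up counts for a single donor; X2 and KNOWN refer to the columns in the code above):

Code
# Hypothetical counts for one donor, for illustration only
submitted      <- 12000  # reads assigned to this donor in the in silico mixture
pool_submitted <- 1e6    # total reads submitted to Census-seq

submitted / pool_submitted  # the in silico fraction (the X2 column above)
# KNOWN, by contrast, is this donor's fraction among only the reads that
# passed quality control, which is what the next graph uses on its x-axis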

One Million Reads Passed Reads versus Inferred

Code
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_one", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)

merged_dfs <- list()

for (dir in dirs) {
  # Construct the file paths
  file_path_txt <- file.path(dir, "my.census.txt")
  file_path_tsv <- file.path(dir, "census_report.tsv")
  df_txt <- read_tsv(file_path_txt, skip = 2)
  df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
  merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
  merged_df$X2 = merged_df$X2 / 1000000
  
  merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}

final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[which(final_df$X2 <= 0.1),]


p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) + 
  geom_point(shape=1) + 
  geom_abline(slope=1, intercept=0,colour='#E41A1C') +
  scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
  scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
  theme_minimal()

final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100

final_df = final_df %>%
  group_by(KNOWN) %>%
  mutate(Median_Percent_Error = median(Percent_Error))

p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) + 
  geom_point() + 
  geom_line() + 
  scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
  scale_y_continuous(name="Median Percent Error", limits=c(0, 150)) +
  theme_minimal()

p1 + p2

This graph, while less neat, is a more accurate representation of what happens after Census-seq’s quality control.

Ten Million Reads In Silico versus Inferred

Code
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_ten", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)

merged_dfs <- list()

for (dir in dirs) {
  # Construct the file paths
  file_path_txt <- file.path(dir, "my.census.txt")
  file_path_tsv <- file.path(dir, "census_report.tsv")
  df_txt <- read_tsv(file_path_txt, skip = 2)
  df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
  merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
  merged_df$X2 = merged_df$X2 / 10000000
  
  merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}

final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[which(final_df$X2 <= 0.1),]


p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) + 
  geom_point(shape=1) + 
  geom_abline(slope=1, intercept=0,colour='#E41A1C') +
  scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
  scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
  theme_minimal()


final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100

final_df_ten = final_df %>%
  group_by(X2) %>%
  mutate(Median_Percent_Error = median(Percent_Error))

p2 <- ggplot(final_df_ten, aes(x = X2, y = Median_Percent_Error)) + 
  geom_point() + 
  geom_line() + 
  scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
  scale_y_continuous(name="Median Percent Error") +
  theme_minimal()

p1 + p2

Ten Million Reads Passed Reads versus Inferred

Code
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_ten", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)

merged_dfs <- list()

for (dir in dirs) {
  # Construct the file paths
  file_path_txt <- file.path(dir, "my.census.txt")
  file_path_tsv <- file.path(dir, "census_report.tsv")
  df_txt <- read_tsv(file_path_txt, skip = 2)
  df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
  merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
  merged_df$X2 = merged_df$X2 / 10000000
  
  merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}

final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[which(final_df$X2 <= 0.1),]


p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) + 
  geom_point(shape=1) + 
  geom_abline(slope=1, intercept=0,colour='#E41A1C') +
  scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
  scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
  theme_minimal()

final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100

final_df = final_df %>%
  group_by(KNOWN) %>%
  mutate(Median_Percent_Error = median(Percent_Error))

p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) + 
  geom_point() + 
  geom_line() + 
  scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
  scale_y_continuous(name="Median Percent Error") +
  theme_minimal()

p1 + p2

One Hundred Million Reads In Silico versus Inferred

Code
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_hundred", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)

merged_dfs <- list()

for (dir in dirs) {
  # Construct the file paths
  file_path_txt <- file.path(dir, "my.census.txt")
  file_path_tsv <- file.path(dir, "census_report.tsv")
  df_txt <- read_tsv(file_path_txt, skip = 2)
  df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
  merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
  merged_df$X2 = merged_df$X2 / 100000000
  
  merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}

final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[which(final_df$X2 <= 0.1),]


p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) + 
  geom_point(shape=1) + 
  geom_abline(slope=1, intercept=0,colour='#E41A1C') +
  scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
  scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
  theme_minimal()


final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100

final_df_hundred = final_df %>%
  group_by(X2) %>%
  mutate(Median_Percent_Error = median(Percent_Error))

p2 <- ggplot(final_df_hundred, aes(x = X2, y = Median_Percent_Error)) + 
  geom_point() + 
  geom_line() + 
  scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
  scale_y_continuous(name="Median Percent Error") +
  theme_minimal()

p1 + p2

One Hundred Million Reads Passed Reads versus Inferred

Code
files <- list.files(path = "/flashscratch/munger-rea/tiny_prop_hundred", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)

merged_dfs <- list()

for (dir in dirs) {
  # Construct the file paths
  file_path_txt <- file.path(dir, "my.census.txt")
  file_path_tsv <- file.path(dir, "census_report.tsv")
  df_txt <- read_tsv(file_path_txt, skip = 2)
  df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
  merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
  merged_df$X2 = merged_df$X2 / 100000000
  
  merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}

final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[which(final_df$X2 <= 0.1),]


p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) + 
  geom_point(shape=1) + 
  geom_abline(slope=1, intercept=0,colour='#E41A1C') +
  scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
  scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
  theme_minimal()

final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100

final_df = final_df %>%
  group_by(KNOWN) %>%
  mutate(Median_Percent_Error = median(Percent_Error))

p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) + 
  geom_point() + 
  geom_line() + 
  scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
  scale_y_continuous(name="Median Percent Error") +
  theme_minimal()

p1 + p2

Coverage Graphs

In order to determine how accurate Census-seq is at different genome coverages, we tested it with 0.1x, 0.2x, 0.3x, 0.5x, 1x, 2x, 3x, 5x, 10x, 20x, and 30x coverage. The mouse genome (GRCm39) has 2,728,222,451 base pairs. These paired-end samples have 150 base pair reads, giving a 300 base pair effective read length. Multiplying the genome size by the desired coverage and dividing by the effective read length gives the number of reads needed; we multiplied this by 1.10 to compensate for reads that would not pass quality control (a quick sanity check of these numbers follows the list). The corresponding read numbers for each genome coverage are below:

0.1x coverage : 1,000,348 reads

0.2x coverage : 2,000,696 reads

0.3x coverage : 3,001,044 reads

0.5x coverage : 5,001,744 reads

1x coverage : 10,003,483 reads

2x coverage : 20,006,965 reads

3x coverage : 30,010,448 reads

5x coverage : 50,017,413 reads

10x coverage : 100,034,825 reads

20x coverage : 200,069,650 reads

30x coverage : 300,104,475 reads
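
These read counts follow directly from the formula above; here is a quick sanity check, which reproduces the list to within a few reads (the small differences come from where one rounds).

Code
genome_size     <- 2728222451  # GRCm39 base pairs
eff_read_length <- 2 * 150     # paired-end 150 bp reads -> 300 bp effective

reads_for_coverage <- function(coverage) {
  round(genome_size * coverage / eff_read_length * 1.10)
}

# e.g. reads_for_coverage(0.1) returns 1,000,348, matching the list above
sapply(c(0.1, 0.2, 0.3, 0.5, 1, 2, 3, 5, 10, 20, 30), reads_for_coverage)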

Code
coverage_graphs = function(paths, coverage){
  # Load every run under `paths`, join the Census-seq estimates to the true
  # in silico fractions (X3), and return the median percent error per fraction,
  # tagged with the human-readable `coverage` label
  files <- list.files(path = paths, pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
  dirs <- dirname(files)
  
  merged_dfs <- list()
  
  for (dir in dirs) {
    # Construct the file paths
    file_path_txt <- file.path(dir, "my.census.txt")
    file_path_tsv <- file.path(dir, "census_report_final.tsv")
    df_txt <- read_tsv(file_path_txt, skip = 2)
    df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
    merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
    
    merged_dfs[[length(merged_dfs) + 1]] <- merged_df
  }
  
  final_df <- do.call(rbind, merged_dfs)
  final_df <- final_df[which(final_df$X3 <= 0.0105),]
  
  final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X3) / final_df$X3 * 100
  
  final_df_c = final_df %>%
    group_by(X3) %>%
    mutate(Median_Percent_Error = median(Percent_Error))
  
  final_df_c$Coverage = coverage
  final_df_c = subset(final_df_c, select = c(DONOR,X3,Median_Percent_Error,Coverage))
  
  return(final_df_c)
}

2D Plot

Code
df_0.1 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.1","0.1x")
df_0.2 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.2","0.2x")
df_0.3 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.3","0.3x")
df_0.5 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.5","0.5x")
df_1 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/1","1x")
df_2 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/2","2x")
df_3 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/3","3x")
df_5 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/5","5x")
df_10 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/10","10x")
df_20 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/20","20x")
df_30 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/30","30x")


mega_merge <- rbind(df_0.1,df_0.2,df_0.3,df_0.5,df_1,df_2,df_3,df_5,df_10,df_20,df_30)

coverage_levels <- c("0.1x", "0.2x", "0.3x","0.5x","1x","2x","3x","5x", "10x", "20x", "30x")
mega_merge$Coverage <- factor(mega_merge$Coverage, levels = coverage_levels)

p2 <- ggplot(mega_merge, aes(x = X3, y = Median_Percent_Error)) + 
  geom_line(aes(color = Coverage)) +
  scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
  scale_y_continuous(name="Median Percent Error") +
  scale_color_manual(values = c("0.1x" = "#ff0000", "0.2x" = "#ff8700", "0.3x" = "#ffd300", "0.5x" = "#deff0a", "1x" = "#a1ff0a", "2x" = "#0aff99", "3x" = "#0aefff", "5x" = "#147df5", "10x" = "#580aff", "30x" = "#be0aff", "20x" = "mediumpurple1")) +
  geom_point() + 
  theme_minimal()

p2

View this same plot in 3D below!

3D Plot

Code
library(plotly)

values = c("0.1x" = "#ff0000", "0.2x" = "#ff8700", "0.3x" = "#ffd300", "0.5x" = "#deff0a", "1x" = "#a1ff0a", "2x" = "#0aff99", "3x" = "#0aefff", "5x" = "#147df5", "10x" = "#580aff", "20x" = "mediumpurple1", "30x" = "#be0aff")

# Make donor IDs unique across coverage levels so they can serve as rownames,
# then sort by in silico fraction so the 3D lines trace left to right
mega_merge$DONOR <- paste0(mega_merge$DONOR, "_", seq.int(nrow(mega_merge)))
mega_merge <- data.frame(mega_merge)
rownames(mega_merge) <- mega_merge$DONOR
mega_merge <- mega_merge %>% arrange(X3)

# Create the 3D line plot with markers
fig <- plot_ly(mega_merge, x = ~X3, y = ~Median_Percent_Error, z = ~Coverage, type = 'scatter3d', mode = 'lines+markers', color = ~Coverage, colors = values, line = list(width = 8), marker = list(color = "black", size = 4))

fig <- fig %>% layout(scene = list(xaxis = list(title = 'Donor Fraction of village (in silico mixture)'), yaxis = list(title = 'Median Percent Error'), zaxis = list(title = 'Genome Coverage')))

fig

Testing Different Aligners

To test Census-seq’s sensitivity to the choice of aligner, we compared four aligners: bwa-mem, minimap, bowtie2, and STAR. We originally tested with 16 random samples, then confirmed our results with 48 samples. All samples were run with equal proportions.

16 Samples - Violin Plot

Code
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)

# Collect each aligner's estimated proportions into one data frame (bwa as the base)
bwa$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION

# Convert each to percent error against the known proportion
bwa$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = ((bwa$minimap - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = ((bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100

meltedviolin = subset(bwa, select = c(DONOR,bwa,minimap,bowtie2,STAR))
meltedviolin = melt(meltedviolin,id="DONOR")
colnames(meltedviolin) <- c('Donor','Aligner','PercentError')

ggplot(meltedviolin, aes(x=Aligner,y=PercentError,fill=Aligner)) +
  geom_abline(slope=0, intercept=0,colour='#000000') +
  geom_violin() +
  scale_fill_brewer(palette="Dark2") + 
  geom_dotplot(binaxis='y',stackdir = 'center',dotsize=1,color="black") +
  labs(title="16 Samples Aligner Type versus Percent Error",x="Aligner", y = "Percent Error") +
  theme_minimal() 

In our initial analysis, bwa-mem and minimap appeared to perform best, while bowtie2 and STAR performed poorly. To confirm these clusters, we visualized the same data as a heatmap.

16 Samples - Heatmap

Code
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)

bwa$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION

bwa$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = ((bwa$minimap - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = ((bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100

bwa$DONOR = sub("plexWell-", "", bwa$DONOR)
bwa$DONOR = sub("_GT23-023.._.*", "", bwa$DONOR)

meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie2,minimap,STAR))

df = as.data.frame(meltedviolin)
rownames(df) = df$DONOR
df$DONOR = NULL

pheatmap(df)

This heatmap confirms that bwa-mem and minimap perform similarly, as do bowtie2 and STAR. Looking along the rows, it also shows that male and female samples do not cluster together, indicating limited or no sex effects. Red represents an overestimate and blue an underestimate.

48 Samples - Violin Plot

Code
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/STAR.census.txt",sep="\t",skip=2,header=TRUE)

bwa$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION

bwa$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = ((bwa$minimap - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = ((bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100

meltedviolin = subset(bwa, select = c(DONOR,bwa,minimap,bowtie2,STAR))
meltedviolin = melt(meltedviolin,id="DONOR")
colnames(meltedviolin) <- c('Donor','Aligner','PercentError')

ggplot(meltedviolin, aes(x=Aligner,y=PercentError,fill=Aligner)) + 
  geom_abline(slope=0, intercept=0,colour='#000000') +
  geom_violin() +
  scale_fill_brewer(palette="Dark2") + 
  geom_dotplot(binaxis='y',stackdir = 'center',dotsize=1,color="black") +
  labs(title="48 Samples Aligner Type versus Percent Error",x="Aligner", y = "Percent Error") +
  theme_minimal() +
  theme(legend.position="none")

In our 48-sample run, we found similar results.

48 Samples - Heatmap

Code
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/STAR.census.txt",sep="\t",skip=2,header=TRUE)

bwa$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION

bwa$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = ((bwa$minimap - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = ((bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100

bwa$DONOR = sub("plexWell-", "", bwa$DONOR)
bwa$DONOR = sub("_GT23-023.._.*", "", bwa$DONOR)

meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie2,minimap,STAR))

df = as.data.frame(meltedviolin)
rownames(df) = df$DONOR
df$DONOR = NULL

pheatmap(df,show_colnames = TRUE,fontsize_row = 6)

Again, red represents an overestimate and blue an underestimate. Bowtie2 and STAR overestimated or underestimated many of the same samples; bwa-mem and minimap did not show this consistency.