Code
library(ggplot2)
library(MASS)
library(reshape2)
library(reshape)
library(patchwork)
library(ggpubr)
library(plotly)
library(dplyr)
library(readr)
library(crayon)
library(pheatmap)Welcome to the online companion to Madeline Rea’s 2024 Jackson Lab Summer Student Program (SSP) poster! As a note, if you are viewing this on a phone, the webpage is best rendered in landscape mode (long-ways/hot dog). You can press the “Code” drop downs to see exactly how I got the graphs created here.
library(ggplot2)
library(MASS)
library(reshape2)
library(reshape)
library(patchwork)
library(ggpubr)
library(plotly)
library(dplyr)
library(readr)
library(crayon)
library(pheatmap)Sixteen samples were randomly chosen from the pool of forty-eight and were proportioned in equal, logarithmic, and “random” proportions. Each bar represents the total reads a sample contributed to the pool of three million reads, with magenta reads failing the quality control. Above the difference in estimation versus true proportion for each VCF and their corresponding percent error is graphed. All samples are ordered by when they were assigned proportions (hence the order of logarithmic being perfect).
compare_vcfs = function(quilt_input,geno_input,census_input,x_order,base=FALSE){
census_input$V1 = sub("plexWell-", "", census_input$V1)
census_input$V1 = sub("_GT23-023.._.*", "", census_input$V1)
quilt_input$DONOR = sub("plexWell-", "", quilt_input$DONOR)
quilt_input$DONOR = sub("_GT23-023.._.*", "", quilt_input$DONOR)
geno_input$DONOR = sub("plexWell-", "", geno_input$DONOR)
geno_input$DONOR = sub("_GT23-023.._.*", "", geno_input$DONOR)
census_input = census_input[order(census_input$V1),]
quilt_input$READS = census_input$V2
meltedbar = subset(quilt_input, select = c(DONOR,READS,COUNT))
colnames(meltedbar) <- c('DONOR','Failed','Passed')
meltedbar = melt(meltedbar,id="DONOR")
colnames(meltedbar) <- c('Donor','Filtered','Reads')
p1 <- ggplot(data=meltedbar, aes(x=Donor,y=Reads, fill=Filtered)) +
geom_bar(position="identity",stat="identity") +
scale_fill_manual(values=c("#D41159", "#69C561")) +
scale_x_discrete(limits = x_order) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
panel.background = element_rect(fill='transparent'),
plot.background = element_rect(fill='transparent', color=NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.background = element_rect(fill='transparent'),
legend.box.background = element_rect(fill='transparent')
) +
labs(y = "Total Reads") +
guides(fill=guide_legend(title="Reads QC"))
if (base == FALSE){
p1 <- p1 + theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.x =element_blank())
}
quilt_input$G_REP = geno_input$REPRESENTATION
quilt_input$Genoprobs = quilt_input$G_REP - quilt_input$KNOWN
quilt_input$Quilt = quilt_input$REPRESENTATION - quilt_input$KNOWN
meltedline = subset(quilt_input, select = c(DONOR,Quilt,Genoprobs))
meltedline = melt(meltedline,id="DONOR")
colnames(meltedline) <- c('Donor','VCF','Differences')
p2 <- ggplot(data = meltedline) +
geom_hline(yintercept=0,linewidth=0.5)+
geom_line(aes(x = Donor, y = Differences, colour = VCF, group = VCF)) +
geom_point(aes(x = Donor, y = Differences, colour = VCF, group = VCF))+
scale_x_discrete(limits = x_order) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
scale_color_manual(values=c('#063970', '#A69943')) +
theme(
panel.background = element_rect(fill='transparent'),
plot.background = element_rect(fill='transparent', color=NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.background = element_rect(fill='transparent'),
legend.box.background = element_rect(fill='transparent')
) + labs(y = "∆ Proportion")
quilt_input$Genoprobs = ((quilt_input$G_REP - quilt_input$KNOWN) / quilt_input$KNOWN) * 100
quilt_input$Quilt = ((quilt_input$REPRESENTATION - quilt_input$KNOWN) / quilt_input$KNOWN) * 100
meltedline_error = subset(quilt_input, select = c(DONOR,Quilt,Genoprobs))
meltedline_error = melt(meltedline_error,id="DONOR")
colnames(meltedline_error) <- c('Donor','VCF','PercentError')
p3 <- ggplot(data = meltedline_error) +
geom_hline(yintercept=0,linewidth=0.5)+
geom_line(aes(x = Donor, y = PercentError, colour = VCF, group = VCF)) +
geom_point(aes(x = Donor, y = PercentError, colour = VCF, group = VCF))+
scale_x_discrete(limits = x_order) +
scale_color_manual(values=c('#063970', '#A69943')) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
theme(
panel.background = element_rect(fill='transparent'),
plot.background = element_rect(fill='transparent', color=NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.background = element_rect(fill='transparent'),
legend.box.background = element_rect(fill='transparent')
) + labs(y = "Percent Error")
stacked_plots <- p2 + p3 + p1 + plot_layout(ncol = 1)
stacked_plots
}log_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_census_report.tsv",header=FALSE,sep="\t")
log_census$V1 = sub("plexWell-", "", log_census$V1)
log_census$V1 = sub("_GT23-023.._.*", "", log_census$V1)
hold = log_census[order(log_census$V2),]
hold = hold$V1equal_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_quilt.census.txt",header=TRUE,sep="\t",skip=2)
equal_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_geno.census.txt",header=TRUE,sep="\t",skip=2)
equal_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_census_report.tsv",header=FALSE,sep="\t")
compare_vcfs(equal_quilt,equal_geno,equal_census,hold,base=TRUE)While the Δ Proportion Graph and Percent Error are nearly identical here, (because of the common denominator in the Percent Error [PE] calculation) this is not the case in the logarithmic and “random” graphs. The Genoprobs VCF notably overestimates sample M35’s contribution to the total pool, but does comparably well to the Quilt VCF otherwise.
log_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_quilt.census.txt",header=TRUE,sep="\t",skip=2)
log_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_geno.census.txt",header=TRUE,sep="\t",skip=2)
log_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_census_report.tsv",header=FALSE,sep="\t")
compare_vcfs(log_quilt,log_geno,log_census,hold,base=TRUE)Here, even though both VCFs underestimate the contribution of sample M14 the percent error on this estimation is very low. This is because the denominator of the PE calculation is so large. Likewise, although Census-seq estimated samples M04 and M38 approximately, the PE on these calculations is huge because the denominator of the PE calculation is very small. Again, the Genoprobs VCF overestimates the contribution of sample M35.
random_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_quilt.census.txt",header=TRUE,sep="\t",skip=2)
random_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_geno.census.txt",header=TRUE,sep="\t",skip=2)
random_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_census_report.tsv",header=FALSE,sep="\t")
compare_vcfs(random_quilt,random_geno,random_census,hold,base=TRUE)The reason “random” is in quotes throughout the poster and throughout this webpage is because if the samples were truly given random proportions, there is no guarantee every sample would be given reads. The python script takes in X amount of reads (in this case three million) and ensures that each sample gets, at minimum, at least 0.5% of the total reads in a pool. It also ensures that any one sample is not over 70% of the entire pool, thus it is not truly random. Census-seq does not always overestimate samples with high proportions (like M14 in the logarithmic example) as seen by sample F19 here.
sample_amounts <- c()
read_numbers <- c()
percent_errors <- c()
proportion_types <- c()
calculate_pe <- function(df) {
df$PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
av_percent_error <- mean(df$PERCENT_ERROR)
return(av_percent_error)
}
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
for (file_path in files) {
path_parts <- strsplit(file_path, "/")[[1]]
sample_amount <- path_parts[length(path_parts) - 4]
proportion_type <- path_parts[length(path_parts) - 3]
read_number <- path_parts[length(path_parts) - 2]
df <- read_tsv(file_path, skip = 2)
percent_error <- calculate_pe(df)
sample_amounts <- c(sample_amounts, sample_amount)
read_numbers <- c(read_numbers, read_number)
percent_errors <- c(percent_errors, percent_error)
proportion_types <- c(proportion_types, proportion_type)
}
percent_errors <- as.numeric(percent_errors)
reads_to_numbers <- list(
'h_thousand' = 100000,
'o_million' = 1000000,
't_million' = 10000000,
'f_million' = 50000000,
'h_million' = 100000000
)
read_numbers <- sapply(read_numbers, function(x) reads_to_numbers[[x]])
samples_to_numbers <- list(
's4' = 4,
's8' = 8,
's16' = 16,
's32' = 32,
's48' = 48
)
sample_amounts <- sapply(sample_amounts, function(x) samples_to_numbers[[x]])
df <- data.frame(
'Reads' = read_numbers,
'Samples' = sample_amounts,
'Percent_Error' = percent_errors,
'Proportion_Type' = proportion_types
)
fig <- plot_ly(df, x = ~Samples, y = ~Reads, z = ~Percent_Error, color = ~Proportion_Type, colors = c('#009E73','#56B4E9','#F0E442'))
fig <- fig %>% add_markers(marker = list(opacity = 0.5))
fig <- fig %>% layout(scene = list(xaxis = list(title = 'Number of Samples'),
yaxis = list(title = 'Number of Reads'),
zaxis = list(title = 'Percent Error')))
figIn this three dimensional plot, the number of reads in a pool is plotted against the number of samples in a pool, which is then plotted against the average percent error for each run. Logarithmic proportions tend to have the highest percent error due to the large amount of samples with very small proportions. Use the controls on the top right of the graph to move as needed!
These graphs are not on the poster due to the nature of a static poster. However, I think they illustrate that PE decreases in samples with very small proportions as the number of reads in the pool increases.
read_numbers <- c()
calculate_pe <- function(df) {
df$PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
av_percent_error <- mean(df$PERCENT_ERROR)
return(av_percent_error)
}
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/s48/random/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
results <- data.frame(Proportion = numeric(), Percent_Error = numeric(), Reads = character())
for (file_path in files) {
path_parts <- strsplit(file_path, "/")[[1]]
read_number <- path_parts[length(path_parts) - 2]
run_number <- path_parts[length(path_parts) - 1]
df <- read_tsv(file_path, skip = 2)
df <- df %>%
mutate(DONOR = paste0(DONOR, "_", run_number),
Percent_Error = abs(REPRESENTATION - KNOWN) / KNOWN * 100,
Reads = read_number)
results <- rbind(results, df)
}
reads_to_numbers <- list(
'h_thousand' = 100000,
'o_million' = 1000000,
't_million' = 10000000,
'f_million' = 50000000,
'h_million' = 100000000
)
results$Reads <- sapply(results$Reads, function(x) reads_to_numbers[[x]])
results$Reads <- as.integer(results$Reads)
results$DONOR <- as.factor(results$DONOR)
fig <- results %>%
plot_ly(
x = ~KNOWN,
y = ~Percent_Error,
frame = ~Reads,
text = ~DONOR,
hoverinfo = "text",
type = 'scatter',
mode = 'markers'
) %>%
layout(
xaxis = list(title = "Proportion in Pool"),
yaxis = list(title = "Percent Error"))
figI start with “Random” proportions here because I think these proportions are most representative of what an actual cell village’s distribution would look like. While some samples don’t have many reads overall, they never fall below 0.5% of the total pool. Meanwhile, some samples take up almost 50% of the pool. Pay special attention to samples with very small proportions, you can watch as they decrease in PE as reads increase.
read_numbers <- c()
calculate_pe <- function(df) {
df$PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
av_percent_error <- mean(df$PERCENT_ERROR)
return(av_percent_error)
}
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/s48/log/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
results <- data.frame(Proportion = numeric(), Percent_Error = numeric(), Reads = character())
for (file_path in files) {
path_parts <- strsplit(file_path, "/")[[1]]
read_number <- path_parts[length(path_parts) - 2]
run_number <- path_parts[length(path_parts) - 1]
df <- read_tsv(file_path, skip = 2)
df <- df %>%
mutate(DONOR = paste0(DONOR, "_", run_number),
Percent_Error = abs(REPRESENTATION - KNOWN) / KNOWN * 100,
Reads = read_number)
results <- rbind(results, df)
}
reads_to_numbers <- list(
'h_thousand' = 100000,
'o_million' = 1000000,
't_million' = 10000000,
'f_million' = 50000000,
'h_million' = 100000000
)
results$Reads <- sapply(results$Reads, function(x) reads_to_numbers[[x]])
results$Reads <- as.integer(results$Reads)
results$DONOR <- as.factor(results$DONOR)
fig <- results %>%
plot_ly(
x = ~KNOWN,
y = ~Percent_Error,
frame = ~Reads,
text = ~DONOR,
hoverinfo = "text",
type = 'scatter',
mode = 'markers'
) %>%
layout(
xaxis = list(title = "Proportion in Pool"),
yaxis = list(title = "Percent Error")
)
figThe y-axis of this graph stretches over eight times as far as the “random” distributions. This is because unlike “random” distributions, they do not have the minimum 0.5% rule. Thus, these samples can have PE upwards of 2500%. This begs the question, at what proportion can Census-seq accurately estimate contribution?
To answer that question, we tested very small proportions (0.0005-0.012) with a differing number of reads. Results are very similar with the original paper’s. (Mapping genetic effects on cellular phenotypes with “cell villages” by Jana M. Mitchell et al.) All of the following were run in pools of 48 samples.
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_one", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 1000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100
final_df_one = final_df %>%
group_by(X2) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df_one, aes(x = X2, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 150)) +
theme_minimal()
p1 + p2These reads are the reads submitted into Census-seq, thus this graph does not consider the reads that did not pass. That graph is below.
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_one", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 1000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100
final_df = final_df %>%
group_by(KNOWN) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 150)) +
theme_minimal()
p1 + p2This graph, while much less neat, shows a much more accurate representation of what happens after Census-seq’s quality control.
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_ten", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 10000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100
final_df_ten = final_df %>%
group_by(X2) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df_ten, aes(x = X2, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error") +
theme_minimal()
p1 + p2files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_ten", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 10000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100
final_df = final_df %>%
group_by(KNOWN) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error") +
theme_minimal()
p1 + p2files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_hundred", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 100000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100
final_df_hundred = final_df %>%
group_by(X2) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df_hundred, aes(x = X2, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error") +
theme_minimal()
p1 + p2files <- list.files(path = "/flashscratch/munger-rea/tiny_prop_hundred", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df$X2 = merged_df$X2 / 100000000
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) +
geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100
final_df = final_df %>%
group_by(KNOWN) %>%
mutate(Median_Percent_Error = median(Percent_Error))
p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) +
geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error") +
theme_minimal()
p1 + p2In order to determine how accurate Census-seq is with different genome coverages, we tested it with 0.1x, 0.2x, 0.3x, 0.5x, 1x, 2x, 3x, 5x, 10x, 20x, and 30x coverage. The mouse genome (GRCm39) has 2,728,222,451 base pairs. These paired end samples have 150 base pair reads, making a 300 base pair effective read length. Multiplying the genome size by the required coverage and dividing this by the effective read length gives the number of reads needed for each coverage amount. We multiplied this number by 1.10, to compensate for reads that would not pass quality control. The corresponding read number for each genome coverage are below:
0.1x coverage : 1,000,348 reads
0.2x coverage : 2,000,696 reads
0.3x coverage : 3,001,044 reads
0.5x coverage : 5,001,744 reads
1x coverage : 10,003,483 reads
2x coverage : 20,006,965 reads
3x coverage : 30,010,448 reads
5x coverage : 50,017,413 reads
10x coverage : 100,034,825 reads
20x coverage : 200,069,650 reads
30x coverage : 300,104,475 reads
coverage_graphs = function(paths,coverage){
files <- list.files(path = paths, pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)
merged_dfs <- list()
for (dir in dirs) {
# Construct the file paths
file_path_txt <- file.path(dir, "my.census.txt")
file_path_tsv <- file.path(dir, "census_report_final.tsv")
df_txt <- read_tsv(file_path_txt, skip = 2)
df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}
final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X3 > 0.0105),]
final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X3) / final_df$X3 * 100
final_df_c = final_df %>%
group_by(X3) %>%
mutate(Median_Percent_Error = median(Percent_Error))
final_df_c$Coverage = coverage
final_df_c = subset(final_df_c, select = c(DONOR,X3,Median_Percent_Error,Coverage))
return(final_df_c)
}df_0.1 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.1","0.1x")
df_0.2 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.2","0.2x")
df_0.3 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.3","0.3x")
df_0.5 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.5","0.5x")
df_1 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/1","1x")
df_2 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/2","2x")
df_3 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/3","3x")
df_5 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/5","5x")
df_10 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/10","10x")
df_20 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/20","20x")
df_30 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/30","30x")
mega_merge <- rbind(df_0.1,df_0.2,df_0.3,df_0.5,df_1,df_2,df_3,df_5,df_10,df_20,df_30)
coverage_levels <- c("0.1x", "0.2x", "0.3x","0.5x","1x","2x","3x","5x", "10x", "20x", "30x")
mega_merge$Coverage <- factor(mega_merge$Coverage, levels = coverage_levels)
p2 <- ggplot(mega_merge, aes(x = X3, y = Median_Percent_Error)) +
geom_line(aes(color = Coverage)) +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error") +
scale_color_manual(values = c("0.1x" = "#ff0000", "0.2x" = "#ff8700", "0.3x" = "#ffd300", "0.5x" = "#deff0a", "1x" = "#a1ff0a", "2x" = "#0aff99", "3x" = "#0aefff", "5x" = "#147df5", "10x" = "#580aff", "30x" = "#be0aff", "20x" = "mediumpurple1")) +
geom_point() +
theme_minimal()
p2View this same plot in 3D below!
library(plotly)
values = c("0.1x" = "#ff0000", "0.2x" = "#ff8700", "0.3x" = "#ffd300", "0.5x" = "#deff0a", "1x" = "#a1ff0a", "2x" = "#0aff99", "3x" = "#0aefff", "5x" = "#147df5", "10x" = "#580aff", "30x" = "#be0aff", "20x" = "mediumpurple1")
mega_merge$DONOR <- paste0(mega_merge$DONOR, "_", seq.int(nrow(mega_merge)))
mega_merge <- data.frame(mega_merge)
rownames(mega_merge) <- mega_merge$DONOR
mega_merge <- mega_merge %>% arrange(X3)
# Create the 3D line plot with markers
fig <- plot_ly(mega_merge, x = ~X3, y = ~Median_Percent_Error, z = ~Coverage, type = 'scatter3d', mode = 'lines+markers', color = ~Coverage, colors = values, line = list(width = 8), marker = list(color = "black", size = 4))
fig <- fig %>% layout(scene = list(xaxis = list(title = 'Donor Fraction of village (in silico mixture)'), yaxis = list(title = 'Median Percent Error'), zaxis = list(title = 'Genome Coverage')))
figIn order to test Census-seq’s sensitivity to different aligners, we tested four different aligners: bwa-mem, minimap, bowtie2, and STAR. Originally tested with 16 random samples, but we wanted to confirm our results with 48 samples. All samples were run with equal proportions.
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)
bwa$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION
bwa$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = ((bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = ((bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100
meltedviolin = subset(bwa, select = c(DONOR,bwa,minimap,bowtie2,STAR))
meltedviolin = melt(meltedviolin,id="DONOR")
colnames(meltedviolin) <- c('Donor','Aligner','PercentError')
ggplot(meltedviolin, aes(x=Aligner,y=PercentError,fill=Aligner)) +
geom_abline(slope=0, intercept=0,colour='#000000') +
geom_violin() +
scale_fill_brewer(palette="Dark2") +
geom_dotplot(binaxis='y',stackdir = 'center',dotsize=1,color="black") +
labs(title="16 Samples Aligner Type versus Percent Error",x="Aligner", y = "Percent Error") +
theme_minimal() In our initial analysis it appeared that bwa-mem and minimap did the best, while bowtie2 and STAR did poorly. To confirm these clusters, we ran this same data in a heatmap.
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)
bwa$STAR = STAR$REPRESENTATION
bwa$mini = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION
bwa$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$mini = ((bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = ((bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$DONOR = sub("plexWell-", "", bwa$DONOR)
bwa$DONOR = sub("_GT23-023.._.*", "", bwa$DONOR)
meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie2,mini,STAR))
df = as.data.frame(meltedviolin)
rownames(df) = df$DONOR
df$DONOR = NULL
pheatmap(df)This heatmap confirms bwa-mem and mini perform similarly, as does bowtie2 and STAR. If you look along the rows, it also confirms that male and female samples are not clustering together, indicating limited or no sex effects. Red represents an overestimate and blue represents an underestimate.
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/STAR.census.txt",sep="\t",skip=2,header=TRUE)
bwa$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION
bwa$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = ((bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = ((bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100
meltedviolin = subset(bwa, select = c(DONOR,bwa,minimap,bowtie2,STAR))
meltedviolin = melt(meltedviolin,id="DONOR")
colnames(meltedviolin) <- c('Donor','Aligner','PercentError')
ggplot(meltedviolin, aes(x=Aligner,y=PercentError,fill=Aligner)) +
geom_abline(slope=0, intercept=0,colour='#000000') +
geom_violin() +
scale_fill_brewer(palette="Dark2") +
geom_dotplot(binaxis='y',stackdir = 'center',dotsize=1,color="black") +
labs(title="48 Samples Aligner Type versus Percent Error",x="Aligner", y = "Percent Error") +
theme_minimal() +
theme(legend.position="none")In our 48 samples run, we found similar results.
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/STAR.census.txt",sep="\t",skip=2,header=TRUE)
bwa$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION
bwa$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = ((bwa$minimap - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = ((bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$DONOR = sub("plexWell-", "", bwa$DONOR)
bwa$DONOR = sub("_GT23-023.._.*", "", bwa$DONOR)
meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie2,minimap,STAR))
df = as.data.frame(meltedviolin)
rownames(df) = df$DONOR
df$DONOR = NULL
pheatmap(df,show_colnames = TRUE,fontsize_row = 6)Again, red represents an overestimate and blue represents an underestimate. Between bowtie2 and STAR, many of the samples were similarly overestimated or underestimated. Bwa-mem and minimap did not have this consistency.