Code
library(ggplot2)
library(MASS)
library(reshape2)
library(reshape)
library(patchwork)
library(ggpubr)
library(plotly)
library(dplyr)
library(readr)
library(crayon)
library(pheatmap)
Welcome to the online companion to Madeline Rea’s 2024 Jackson Lab Summer Student Program (SSP) poster! As a note, if you are viewing this on a phone, the webpage is best rendered in landscape mode (long-ways/hot dog). You can press the “Code” drop downs to see exactly how I got the graphs created here.
library(ggplot2)
library(MASS)
library(reshape2)
library(reshape)
library(patchwork)
library(ggpubr)
library(plotly)
library(dplyr)
library(readr)
library(crayon)
library(pheatmap)
Sixteen samples were randomly chosen from the pool of forty-eight and were proportioned in equal, logarithmic, and “random” proportions. Each bar represents the total reads a sample contributed to the pool of three million reads, with magenta reads failing the quality control. Above the difference in estimation versus true proportion for each VCF and their corresponding percent error is graphed. All samples are ordered by when they were assigned proportions (hence the order of logarithmic being perfect).
= function(quilt_input,geno_input,census_input,x_order,base=FALSE){
compare_vcfs
$V1 = sub("plexWell-", "", census_input$V1)
census_input$V1 = sub("_GT23-023.._.*", "", census_input$V1)
census_input$DONOR = sub("plexWell-", "", quilt_input$DONOR)
quilt_input$DONOR = sub("_GT23-023.._.*", "", quilt_input$DONOR)
quilt_input$DONOR = sub("plexWell-", "", geno_input$DONOR)
geno_input$DONOR = sub("_GT23-023.._.*", "", geno_input$DONOR)
geno_input
= census_input[order(census_input$V1),]
census_input $READS = census_input$V2
quilt_input= subset(quilt_input, select = c(DONOR,READS,COUNT))
meltedbar colnames(meltedbar) <- c('DONOR','Failed','Passed')
= melt(meltedbar,id="DONOR")
meltedbar colnames(meltedbar) <- c('Donor','Filtered','Reads')
<- ggplot(data=meltedbar, aes(x=Donor,y=Reads, fill=Filtered)) +
p1 geom_bar(position="identity",stat="identity") +
scale_fill_manual(values=c("#D41159", "#69C561")) +
scale_x_discrete(limits = x_order) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
panel.background = element_rect(fill='transparent'),
plot.background = element_rect(fill='transparent', color=NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.background = element_rect(fill='transparent'),
legend.box.background = element_rect(fill='transparent')
+
) labs(y = "Total Reads") +
guides(fill=guide_legend(title="Reads QC"))
if (base == FALSE){
<- p1 + theme(axis.text.x=element_blank(),
p1 axis.ticks.x=element_blank(),
axis.title.x =element_blank())
}
$G_REP = geno_input$REPRESENTATION
quilt_input$Genoprobs = quilt_input$G_REP - quilt_input$KNOWN
quilt_input$Quilt = quilt_input$REPRESENTATION - quilt_input$KNOWN
quilt_input= subset(quilt_input, select = c(DONOR,Quilt,Genoprobs))
meltedline = melt(meltedline,id="DONOR")
meltedline colnames(meltedline) <- c('Donor','VCF','Differences')
<- ggplot(data = meltedline) +
p2 geom_hline(yintercept=0,linewidth=0.5)+
geom_line(aes(x = Donor, y = Differences, colour = VCF, group = VCF)) +
geom_point(aes(x = Donor, y = Differences, colour = VCF, group = VCF))+
scale_x_discrete(limits = x_order) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
scale_color_manual(values=c('#063970', '#A69943')) +
theme(
panel.background = element_rect(fill='transparent'),
plot.background = element_rect(fill='transparent', color=NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.background = element_rect(fill='transparent'),
legend.box.background = element_rect(fill='transparent')
+ labs(y = "∆ Proportion")
)
$Genoprobs = ((quilt_input$G_REP - quilt_input$KNOWN) / quilt_input$KNOWN) * 100
quilt_input$Quilt = ((quilt_input$REPRESENTATION - quilt_input$KNOWN) / quilt_input$KNOWN) * 100
quilt_input= subset(quilt_input, select = c(DONOR,Quilt,Genoprobs))
meltedline_error = melt(meltedline_error,id="DONOR")
meltedline_error colnames(meltedline_error) <- c('Donor','VCF','PercentError')
<- ggplot(data = meltedline_error) +
p3 geom_hline(yintercept=0,linewidth=0.5)+
geom_line(aes(x = Donor, y = PercentError, colour = VCF, group = VCF)) +
geom_point(aes(x = Donor, y = PercentError, colour = VCF, group = VCF))+
scale_x_discrete(limits = x_order) +
scale_color_manual(values=c('#063970', '#A69943')) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
theme(
panel.background = element_rect(fill='transparent'),
plot.background = element_rect(fill='transparent', color=NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.background = element_rect(fill='transparent'),
legend.box.background = element_rect(fill='transparent')
+ labs(y = "Percent Error")
)
<- p2 + p3 + p1 + plot_layout(ncol = 1)
stacked_plots
stacked_plots }
= read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_census_report.tsv",header=FALSE,sep="\t")
log_census
$V1 = sub("plexWell-", "", log_census$V1)
log_census$V1 = sub("_GT23-023.._.*", "", log_census$V1)
log_census
= log_census[order(log_census$V2),]
hold = hold$V1 hold
= read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_quilt.census.txt",header=TRUE,sep="\t",skip=2)
equal_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_geno.census.txt",header=TRUE,sep="\t",skip=2)
equal_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_census_report.tsv",header=FALSE,sep="\t")
equal_census
compare_vcfs(equal_quilt,equal_geno,equal_census,hold,base=TRUE)
While the Δ Proportion Graph and Percent Error are nearly identical here, (because of the common denominator in the Percent Error [PE] calculation) this is not the case in the logarithmic and “random” graphs. The Genoprobs VCF notably overestimates sample M35’s contribution to the total pool, but does comparably well to the Quilt VCF otherwise.
= read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_quilt.census.txt",header=TRUE,sep="\t",skip=2)
log_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_geno.census.txt",header=TRUE,sep="\t",skip=2)
log_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_census_report.tsv",header=FALSE,sep="\t")
log_census
compare_vcfs(log_quilt,log_geno,log_census,hold,base=TRUE)
Here, even though both VCFs underestimate the contribution of sample M14 the percent error on this estimation is very low. This is because the denominator of the PE calculation is so large. Likewise, although Census-seq estimated samples M04 and M38 approximately, the PE on these calculations is huge because the denominator of the PE calculation is very small. Again, the Genoprobs VCF overestimates the contribution of sample M35.
= read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_quilt.census.txt",header=TRUE,sep="\t",skip=2)
random_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_geno.census.txt",header=TRUE,sep="\t",skip=2)
random_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_census_report.tsv",header=FALSE,sep="\t")
random_census compare_vcfs(random_quilt,random_geno,random_census,hold,base=TRUE)
The reason “random” is in quotes throughout the poster and throughout this webpage is because if the samples were truly given random proportions, there is no guarantee every sample would be given reads. The python script takes in X amount of reads (in this case three million) and ensures that each sample gets, at minimum, at least 0.5% of the total reads in a pool. It also ensures that any one sample is not over 70% of the entire pool, thus it is not truly random. Census-seq does not always overestimate samples with high proportions (like M14 in the logarithmic example) as seen by sample F19 here.
<- c()
sample_amounts <- c()
read_numbers <- c()
percent_errors <- c()
proportion_types
<- function(df) {
calculate_pe $PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
df<- mean(df$PERCENT_ERROR)
av_percent_error return(av_percent_error)
}
<- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
files
for (file_path in files) {
<- strsplit(file_path, "/")[[1]]
path_parts <- path_parts[length(path_parts) - 4]
sample_amount <- path_parts[length(path_parts) - 3]
proportion_type <- path_parts[length(path_parts) - 2]
read_number
<- read_tsv(file_path, skip = 2)
df <- calculate_pe(df)
percent_error <- c(sample_amounts, sample_amount)
sample_amounts <- c(read_numbers, read_number)
read_numbers <- c(percent_errors, percent_error)
percent_errors <- c(proportion_types, proportion_type)
proportion_types
}
<- as.numeric(percent_errors)
percent_errors
<- list(
reads_to_numbers 'h_thousand' = 100000,
'o_million' = 1000000,
't_million' = 10000000,
'f_million' = 50000000,
'h_million' = 100000000
)
<- sapply(read_numbers, function(x) reads_to_numbers[[x]])
read_numbers
<- list(
samples_to_numbers 's4' = 4,
's8' = 8,
's16' = 16,
's32' = 32,
's48' = 48
)
<- sapply(sample_amounts, function(x) samples_to_numbers[[x]])
sample_amounts
<- data.frame(
df 'Reads' = read_numbers,
'Samples' = sample_amounts,
'Percent_Error' = percent_errors,
'Proportion_Type' = proportion_types
)
<- plot_ly(df, x = ~Samples, y = ~Reads, z = ~Percent_Error, color = ~Proportion_Type, colors = c('#009E73','#56B4E9','#F0E442'))
fig <- fig %>% add_markers(marker = list(opacity = 0.5))
fig <- fig %>% layout(scene = list(xaxis = list(title = 'Number of Samples'),
fig yaxis = list(title = 'Number of Reads'),
zaxis = list(title = 'Percent Error')))
fig
In this three dimensional plot, the number of reads in a pool is plotted against the number of samples in a pool, which is then plotted against the average percent error for each run. Logarithmic proportions tend to have the highest percent error due to the large amount of samples with very small proportions. Use the controls on the top right of the graph to move as needed!
These graphs are not on the poster due to the nature of a static poster. However, I think they illustrate that PE decreases in samples with very small proportions as the number of reads in the pool increases.
<- c()
read_numbers
<- function(df) {
calculate_pe $PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
df<- mean(df$PERCENT_ERROR)
av_percent_error return(av_percent_error)
}
<- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/s48/random/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
files
<- data.frame(Proportion = numeric(), Percent_Error = numeric(), Reads = character())
results
for (file_path in files) {
<- strsplit(file_path, "/")[[1]]
path_parts <- path_parts[length(path_parts) - 2]
read_number <- path_parts[length(path_parts) - 1]
run_number
<- read_tsv(file_path, skip = 2)
df
<- df %>%
df mutate(DONOR = paste0(DONOR, "_", run_number),
Percent_Error = abs(REPRESENTATION - KNOWN) / KNOWN * 100,
Reads = read_number)
<- rbind(results, df)
results
}
<- list(
reads_to_numbers 'h_thousand' = 100000,
'o_million' = 1000000,
't_million' = 10000000,
'f_million' = 50000000,
'h_million' = 100000000
)
$Reads <- sapply(results$Reads, function(x) reads_to_numbers[[x]])
results$Reads <- as.integer(results$Reads)
results
$DONOR <- as.factor(results$DONOR)
results
<- results %>%
fig plot_ly(
x = ~KNOWN,
y = ~Percent_Error,
frame = ~Reads,
text = ~DONOR,
hoverinfo = "text",
type = 'scatter',
mode = 'markers'
%>%
) layout(
xaxis = list(title = "Proportion in Pool"),
yaxis = list(title = "Percent Error"))
fig
I start with “Random” proportions here because I think these proportions are most representative of what an actual cell village’s distribution would look like. While some samples don’t have many reads overall, they never fall below 0.5% of the total pool. Meanwhile, some samples take up almost 50% of the pool. Pay special attention to samples with very small proportions, you can watch as they decrease in PE as reads increase.
<- c()
read_numbers
<- function(df) {
calculate_pe $PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
df<- mean(df$PERCENT_ERROR)
av_percent_error return(av_percent_error)
}
<- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/s48/log/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
files
<- data.frame(Proportion = numeric(), Percent_Error = numeric(), Reads = character())
results
for (file_path in files) {
<- strsplit(file_path, "/")[[1]]
path_parts <- path_parts[length(path_parts) - 2]
read_number <- path_parts[length(path_parts) - 1]
run_number
<- read_tsv(file_path, skip = 2)
df
<- df %>%
df mutate(DONOR = paste0(DONOR, "_", run_number),
Percent_Error = abs(REPRESENTATION - KNOWN) / KNOWN * 100,
Reads = read_number)
<- rbind(results, df)
results
}
<- list(
reads_to_numbers 'h_thousand' = 100000,
'o_million' = 1000000,
't_million' = 10000000,
'f_million' = 50000000,
'h_million' = 100000000
)
$Reads <- sapply(results$Reads, function(x) reads_to_numbers[[x]])
results$Reads <- as.integer(results$Reads)
results
$DONOR <- as.factor(results$DONOR)
results
<- results %>%
fig plot_ly(
x = ~KNOWN,
y = ~Percent_Error,
frame = ~Reads,
text = ~DONOR,
hoverinfo = "text",
type = 'scatter',
mode = 'markers'
%>%
) layout(
xaxis = list(title = "Proportion in Pool"),
yaxis = list(title = "Percent Error")
) fig
The y-axis of this graph stretches over eight times as far as the “random” distributions. This is because unlike “random” distributions, they do not have the minimum 0.5% rule. Thus, these samples can have PE upwards of 2500%. This begs the question, at what proportion can Census-seq accurately estimate contribution?
To answer that question, we tested very small proportions (0.0005-0.012) with a differing number of reads. Results are very similar with the original paper’s. (Mapping genetic effects on cellular phenotypes with “cell villages” by Jana M. Mitchell et al.) All of the following were run in pools of 48 samples.
<- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_one", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
files <- dirname(files)
dirs
<- list()
merged_dfs
for (dir in dirs) {
# Construct the file paths
<- file.path(dir, "my.census.txt")
file_path_txt <- file.path(dir, "census_report.tsv")
file_path_tsv <- read_tsv(file_path_txt, skip = 2)
df_txt <- read_tsv(file_path_tsv, col_names=FALSE)
df_tsv <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df $X2 = merged_df$X2 / 1000000
merged_df
length(merged_dfs) + 1]] <- merged_df
merged_dfs[[
}
<- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
final_df
<- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) +
p1 geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100
final_df
= final_df %>%
final_df_one group_by(X2) %>%
mutate(Median_Percent_Error = median(Percent_Error))
<- ggplot(final_df_one, aes(x = X2, y = Median_Percent_Error)) +
p2 geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 150)) +
theme_minimal()
+ p2 p1
These reads are the reads submitted into Census-seq, thus this graph does not consider the reads that did not pass. That graph is below.
<- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_one", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
files <- dirname(files)
dirs
<- list()
merged_dfs
for (dir in dirs) {
# Construct the file paths
<- file.path(dir, "my.census.txt")
file_path_txt <- file.path(dir, "census_report.tsv")
file_path_tsv <- read_tsv(file_path_txt, skip = 2)
df_txt <- read_tsv(file_path_tsv, col_names=FALSE)
df_tsv <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df $X2 = merged_df$X2 / 1000000
merged_df
length(merged_dfs) + 1]] <- merged_df
merged_dfs[[
}
<- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
final_df
<- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) +
p1 geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100
final_df
= final_df %>%
final_df group_by(KNOWN) %>%
mutate(Median_Percent_Error = median(Percent_Error))
<- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) +
p2 geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error", limits=c(0, 150)) +
theme_minimal()
+ p2 p1
This graph, while much less neat, shows a much more accurate representation of what happens after Census-seq’s quality control.
<- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_ten", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
files <- dirname(files)
dirs
<- list()
merged_dfs
for (dir in dirs) {
# Construct the file paths
<- file.path(dir, "my.census.txt")
file_path_txt <- file.path(dir, "census_report.tsv")
file_path_tsv <- read_tsv(file_path_txt, skip = 2)
df_txt <- read_tsv(file_path_tsv, col_names=FALSE)
df_tsv <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df $X2 = merged_df$X2 / 10000000
merged_df
length(merged_dfs) + 1]] <- merged_df
merged_dfs[[
}
<- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
final_df
<- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) +
p1 geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100
final_df
= final_df %>%
final_df_ten group_by(X2) %>%
mutate(Median_Percent_Error = median(Percent_Error))
<- ggplot(final_df_ten, aes(x = X2, y = Median_Percent_Error)) +
p2 geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error") +
theme_minimal()
+ p2 p1
<- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_ten", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
files <- dirname(files)
dirs
<- list()
merged_dfs
for (dir in dirs) {
# Construct the file paths
<- file.path(dir, "my.census.txt")
file_path_txt <- file.path(dir, "census_report.tsv")
file_path_tsv <- read_tsv(file_path_txt, skip = 2)
df_txt <- read_tsv(file_path_tsv, col_names=FALSE)
df_tsv <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df $X2 = merged_df$X2 / 10000000
merged_df
length(merged_dfs) + 1]] <- merged_df
merged_dfs[[
}
<- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
final_df
<- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) +
p1 geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100
final_df
= final_df %>%
final_df group_by(KNOWN) %>%
mutate(Median_Percent_Error = median(Percent_Error))
<- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) +
p2 geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error") +
theme_minimal()
+ p2 p1
<- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_hundred", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
files <- dirname(files)
dirs
<- list()
merged_dfs
for (dir in dirs) {
# Construct the file paths
<- file.path(dir, "my.census.txt")
file_path_txt <- file.path(dir, "census_report.tsv")
file_path_tsv <- read_tsv(file_path_txt, skip = 2)
df_txt <- read_tsv(file_path_tsv, col_names=FALSE)
df_tsv <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df $X2 = merged_df$X2 / 100000000
merged_df
length(merged_dfs) + 1]] <- merged_df
merged_dfs[[
}
<- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
final_df
<- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) +
p1 geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100
final_df
= final_df %>%
final_df_hundred group_by(X2) %>%
mutate(Median_Percent_Error = median(Percent_Error))
<- ggplot(final_df_hundred, aes(x = X2, y = Median_Percent_Error)) +
p2 geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error") +
theme_minimal()
+ p2 p1
<- list.files(path = "/flashscratch/munger-rea/tiny_prop_hundred", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
files <- dirname(files)
dirs
<- list()
merged_dfs
for (dir in dirs) {
# Construct the file paths
<- file.path(dir, "my.census.txt")
file_path_txt <- file.path(dir, "census_report.tsv")
file_path_tsv <- read_tsv(file_path_txt, skip = 2)
df_txt <- read_tsv(file_path_tsv, col_names=FALSE)
df_tsv <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df $X2 = merged_df$X2 / 100000000
merged_df
length(merged_dfs) + 1]] <- merged_df
merged_dfs[[
}
<- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]
final_df
<- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) +
p1 geom_point(shape=1) +
geom_abline(slope=1, intercept=0,colour='#E41A1C') +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
theme_minimal()
$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100
final_df
= final_df %>%
final_df group_by(KNOWN) %>%
mutate(Median_Percent_Error = median(Percent_Error))
<- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) +
p2 geom_point() +
geom_line() +
scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error") +
theme_minimal()
+ p2 p1
In order to determine how accurate Census-seq is with different genome coverages, we tested it with 0.1x, 0.2x, 0.3x, 0.5x, 1x, 2x, 3x, 5x, 10x, 20x, and 30x coverage. The mouse genome (GRCm39) has 2,728,222,451 base pairs. These paired end samples have 150 base pair reads, making a 300 base pair effective read length. Multiplying the genome size by the required coverage and dividing this by the effective read length gives the number of reads needed for each coverage amount. We multiplied this number by 1.10, to compensate for reads that would not pass quality control. The corresponding read number for each genome coverage are below:
0.1x coverage : 1,000,348 reads
0.2x coverage : 2,000,696 reads
0.3x coverage : 3,001,044 reads
0.5x coverage : 5,001,744 reads
1x coverage : 10,003,483 reads
2x coverage : 20,006,965 reads
3x coverage : 30,010,448 reads
5x coverage : 50,017,413 reads
10x coverage : 100,034,825 reads
20x coverage : 200,069,650 reads
30x coverage : 300,104,475 reads
= function(paths,coverage){
coverage_graphs
<- list.files(path = paths, pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
files <- dirname(files)
dirs
<- list()
merged_dfs
for (dir in dirs) {
# Construct the file paths
<- file.path(dir, "my.census.txt")
file_path_txt <- file.path(dir, "census_report_final.tsv")
file_path_tsv <- read_tsv(file_path_txt, skip = 2)
df_txt <- read_tsv(file_path_tsv, col_names=FALSE)
df_tsv <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
merged_df
length(merged_dfs) + 1]] <- merged_df
merged_dfs[[
}
<- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X3 > 0.0105),]
final_df
$Percent_Error = abs(final_df$REPRESENTATION - final_df$X3) / final_df$X3 * 100
final_df
= final_df %>%
final_df_c group_by(X3) %>%
mutate(Median_Percent_Error = median(Percent_Error))
$Coverage = coverage
final_df_c= subset(final_df_c, select = c(DONOR,X3,Median_Percent_Error,Coverage))
final_df_c
return(final_df_c)
}
.1 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.1","0.1x")
df_0.2 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.2","0.2x")
df_0.3 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.3","0.3x")
df_0.5 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/0.5","0.5x")
df_0<- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/1","1x")
df_1 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/2","2x")
df_2 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/3","3x")
df_3 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/5","5x")
df_5 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/10","10x")
df_10 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/20","20x")
df_20 <- coverage_graphs("/projects/munger-lab/projects/census_seq_SSP2024/coverage_dir/30","30x")
df_30
<- rbind(df_0.1,df_0.2,df_0.3,df_0.5,df_1,df_2,df_3,df_5,df_10,df_20,df_30)
mega_merge
<- c("0.1x", "0.2x", "0.3x","0.5x","1x","2x","3x","5x", "10x", "20x", "30x")
coverage_levels $Coverage <- factor(mega_merge$Coverage, levels = coverage_levels)
mega_merge
<- ggplot(mega_merge, aes(x = X3, y = Median_Percent_Error)) +
p2 geom_line(aes(color = Coverage)) +
scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
scale_y_continuous(name="Median Percent Error") +
scale_color_manual(values = c("0.1x" = "#ff0000", "0.2x" = "#ff8700", "0.3x" = "#ffd300", "0.5x" = "#deff0a", "1x" = "#a1ff0a", "2x" = "#0aff99", "3x" = "#0aefff", "5x" = "#147df5", "10x" = "#580aff", "30x" = "#be0aff", "20x" = "mediumpurple1")) +
geom_point() +
theme_minimal()
p2
View this same plot in 3D below!
library(plotly)
= c("0.1x" = "#ff0000", "0.2x" = "#ff8700", "0.3x" = "#ffd300", "0.5x" = "#deff0a", "1x" = "#a1ff0a", "2x" = "#0aff99", "3x" = "#0aefff", "5x" = "#147df5", "10x" = "#580aff", "30x" = "#be0aff", "20x" = "mediumpurple1")
values $DONOR <- paste0(mega_merge$DONOR, "_", seq.int(nrow(mega_merge)))
mega_merge<- data.frame(mega_merge)
mega_merge rownames(mega_merge) <- mega_merge$DONOR
<- mega_merge %>% arrange(X3)
mega_merge
# Create the 3D line plot with markers
<- plot_ly(mega_merge, x = ~X3, y = ~Median_Percent_Error, z = ~Coverage, type = 'scatter3d', mode = 'lines+markers', color = ~Coverage, colors = values, line = list(width = 8), marker = list(color = "black", size = 4))
fig
<- fig %>% layout(scene = list(xaxis = list(title = 'Donor Fraction of village (in silico mixture)'), yaxis = list(title = 'Median Percent Error'), zaxis = list(title = 'Genome Coverage')))
fig
fig
In order to test Census-seq’s sensitivity to different aligners, we tested four different aligners: bwa-mem, minimap, bowtie2, and STAR. Originally tested with 16 random samples, but we wanted to confirm our results with 48 samples. All samples were run with equal proportions.
= read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)
STAR
$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION
bwa
$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = ((bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = ((bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100
bwa
= subset(bwa, select = c(DONOR,bwa,minimap,bowtie2,STAR))
meltedviolin = melt(meltedviolin,id="DONOR")
meltedviolin colnames(meltedviolin) <- c('Donor','Aligner','PercentError')
ggplot(meltedviolin, aes(x=Aligner,y=PercentError,fill=Aligner)) +
geom_abline(slope=0, intercept=0,colour='#000000') +
geom_violin() +
scale_fill_brewer(palette="Dark2") +
geom_dotplot(binaxis='y',stackdir = 'center',dotsize=1,color="black") +
labs(title="16 Samples Aligner Type versus Percent Error",x="Aligner", y = "Percent Error") +
theme_minimal()
In our initial analysis it appeared that bwa-mem and minimap did the best, while bowtie2 and STAR did poorly. To confirm these clusters, we ran this same data in a heatmap.
= read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)
STAR
$STAR = STAR$REPRESENTATION
bwa$mini = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION
bwa
$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$mini = ((bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = ((bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100
bwa
$DONOR = sub("plexWell-", "", bwa$DONOR)
bwa$DONOR = sub("_GT23-023.._.*", "", bwa$DONOR)
bwa
= subset(bwa, select = c(DONOR,bwa,bowtie2,mini,STAR))
meltedviolin
= as.data.frame(meltedviolin)
df rownames(df) = df$DONOR
$DONOR = NULL
df
pheatmap(df)
This heatmap confirms bwa-mem and mini perform similarly, as does bowtie2 and STAR. If you look along the rows, it also confirms that male and female samples are not clustering together, indicating limited or no sex effects. Red represents an overestimate and blue represents an underestimate.
= read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/mini.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bwa.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/STAR.census.txt",sep="\t",skip=2,header=TRUE)
STAR
$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION
bwa
$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = ((bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = ((bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100
bwa
= subset(bwa, select = c(DONOR,bwa,minimap,bowtie2,STAR))
meltedviolin = melt(meltedviolin,id="DONOR")
meltedviolin colnames(meltedviolin) <- c('Donor','Aligner','PercentError')
ggplot(meltedviolin, aes(x=Aligner,y=PercentError,fill=Aligner)) +
geom_abline(slope=0, intercept=0,colour='#000000') +
geom_violin() +
scale_fill_brewer(palette="Dark2") +
geom_dotplot(binaxis='y',stackdir = 'center',dotsize=1,color="black") +
labs(title="48 Samples Aligner Type versus Percent Error",x="Aligner", y = "Percent Error") +
theme_minimal() +
theme(legend.position="none")
In our 48 samples run, we found similar results.
= read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/mini.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/bwa.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test_48/STAR.census.txt",sep="\t",skip=2,header=TRUE)
STAR
$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION
bwa
$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = ((bwa$minimap - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = ((bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100
bwa
$DONOR = sub("plexWell-", "", bwa$DONOR)
bwa$DONOR = sub("_GT23-023.._.*", "", bwa$DONOR)
bwa
= subset(bwa, select = c(DONOR,bwa,bowtie2,minimap,STAR))
meltedviolin
= as.data.frame(meltedviolin)
df rownames(df) = df$DONOR
$DONOR = NULL
df
pheatmap(df,show_colnames = TRUE,fontsize_row = 6)
Again, red represents an overestimate and blue represents an underestimate. Between bowtie2 and STAR, many of the samples were similarly overestimated or underestimated. Bwa-mem and minimap did not have this consistency.