library(Signac)
library(Seurat)
library(ComplexHeatmap)
library(GenomeInfoDb)
library(EnsDb.Hsapiens.v75)
library(BuenColors)
"%ni%" <- Negate("%in%")
This vignette provides accessory code for inferring subclonal variants and performing integrative analyses of mtscATAC-seq data utilizing the output of mgatk
. This workflow is most similar to what was done in the mtscATAC-seq paper. See the Find mtDNA variants
section below to rapidly identify subclonal variants from the full enumeration of possibilities that is produced from mgatk by default.
For additional use cases of how to infer and utilize subclonal variants, check out the GitHub repository that contains all analyses from the mtscATAC-seq paper: https://github.com/caleblareau/mtscATACpaper_reproducibility.
Note: working with Tim Stuart, much of the functionality shown here has now been incorporated into the popular Signac/Seurat R package workflow. Check out the mitochondrial vignettes at the Signac homepage: https://satijalab.org/signac/
Here, we profiled a modest number (aimed for ~2,500) cells from a patient with a colorectal cancer (CRC) tumor. The tumor contains both malignant epithelial cells as well as infiltrating immune populations. This is a simple example that should serve as a basis for using mtscATAC-seq data to infer cell states based on chromatin data and clonal populations based on mtDNA sequencing.
Here is basic filtering of barcodes after the 10x knee-call– nothing too inspired.
There a population of myeloid cells that apparently didn’t yield very good accessible chromatin (which we will explore more below), so we are setting an inclusive filter to retain these cells.
counts <- Read10X_h5(filename = "../data/CRC-CellRanger/outs/CRC_v12-mtMask_mgatk.filtered_peak_bc_matrix.h5")
metadata <- read.csv(
file = "../data/CRC-CellRanger/outs/CRC_v12-mtMask_mgatk.singlecell.csv",
header = TRUE,
row.names = 1
)
crc <- CreateSeuratObject(
counts = counts,
assay = 'peaks',
project = 'ATAC',
min.cells = 1,
meta.data = metadata
)
fragment.path <- '../data/CRC-CellRanger/outs/CRC_v12-mtMask_mgatk.fragments.tsv.gz'
crc <- SetFragments(
object = crc,
file = fragment.path
)
# Augment QC
crc$pct_reads_in_peaks <- crc$peak_region_fragments / crc$passed_filters * 100
crc$pct_reads_in_DNase <- crc$DNase_sensitive_region_fragments / crc$passed_filters * 100
# There will be a population of high fragment, low frip cells that we will choose to keep to examine downstream
qplot(log10(crc@meta.data$passed_filters), crc@meta.data$pct_reads_in_DNase) +
pretty_plot() + L_border() + labs(x = "# Unique Fragments", y = "% Reads in Peaks") +
geom_vline(xintercept = 3, color = "firebrick") +
geom_hline(yintercept = 40, color = "firebrick")
crc_filt <- subset(crc, subset = passed_filters >= 1000 & pct_reads_in_DNase >= 40)
crc_filt
## An object of class Seurat
## 87269 features across 2003 samples within 1 assay
## Active assay: peaks (87269 features, 0 variable features)
Typically, I would filter cells for a minimum 20x coverage (esp. for hematopoetic cells), but for this tumor setting, a 10x filter had to be used because of the low-ish copy number. I think that this mostly has to do with needing to optimize lysis conditions in solid tissues.
mgatk_se <- readRDS("../data/CRC_mgatk.rds")
# Subset to HQ cells that exist so far
mgatk_se <- mgatk_se[,colnames(crc_filt)]
# Threshold based on abundance of mtDNA
crc_filt$mtDNA_depth <- mgatk_se$depth
# Could probably replace this with a violin plot
qplot(crc_filt$mtDNA_depth) + scale_x_log10(breaks = c(1,10,100)) +
geom_vline(xintercept = 10, color = "firebrick") +
pretty_plot() + L_border() + labs(x = "mean mtDNA depth/bp", y = "count")
# Final filter -- mgatk, Signac, and fragments all
crc_filt2 <- subset(crc_filt, mtDNA_depth >=10)
filtered.fragment.path <- "../outs/CRC-filtered.fragments.tsv"
mgatk_se <- mgatk_se[,colnames(crc_filt2)]
if(!file.exists(paste0(filtered.fragment.path, ".bgz"))){
FilterFragments(
fragment.path = fragment.path,
cells = colnames(crc_filt2),
output.path = filtered.fragment.path
)
}
Having now filtered out cells with decent chromatin and mitochondrial data, let’s visualize the ATAC data using standard Signac workflows…
# Rename for convenience
crc <- crc_filt2
# Run semi-standard Signac dimension reduction, clustering, etc.
crc <- RunTFIDF(crc)
crc <- FindTopFeatures(crc, min.cutoff = 'q50')
crc <- RunSVD(
object = crc,
assay = 'peaks',
reduction.key = 'LSI_',
reduction.name = 'lsi',
irlba.work=500
)
crc <- RunUMAP(object = crc, reduction = 'lsi', dims = 2:30)
crc <- FindNeighbors(object = crc, reduction = 'lsi', dims = 2:30)
crc <- FindClusters(object = crc, verbose = FALSE, algorithm = 3, resolution = 0.5)
DimPlot(object = crc, label = TRUE) + NoLegend()
Here, we can see that some clusters have less mitochondrial DNA content as well as worse reads in DNase peaks than others. Let’s define the cell states using marker genes to better understand this…
ga_rds <- "../outs/CRC-geneActivites.rds"
if(!file.exists(ga_rds)){
# Pull gene coordinates
gene.coords <- genes(EnsDb.Hsapiens.v75, filter = ~ gene_biotype == "protein_coding")
seqlevelsStyle(gene.coords) <- 'UCSC'
genebody.coords <- keepStandardChromosomes(gene.coords, pruning.mode = 'coarse')
genebodyandpromoter.coords <- Extend(x = gene.coords, upstream = 2000, downstream = 0)
# create a gene by cell matrix of scores
gene.activities <- FeatureMatrix(
fragments = paste0(filtered.fragment.path, ".bgz"),
features = genebodyandpromoter.coords,
cells = colnames(crc),
chunk = 20
)
# convert rownames from chromsomal coordinates into gene names
gene.key <- genebodyandpromoter.coords$gene_name
names(gene.key) <- GRangesToString(grange = genebodyandpromoter.coords)
rownames(gene.activities) <- gene.key[rownames(gene.activities)]
saveRDS(gene.activities, ga_rds)
} else{
gene.activities <- readRDS(ga_rds)
}
# Put into Seurat object
crc[['RNA']] <- CreateAssayObject(counts = gene.activities)
crc <- NormalizeData(
object = crc,
assay = 'RNA',
normalization.method = 'LogNormalize',
scale.factor = median(crc$nCount_RNA)
)
DefaultAssay(crc) <- 'RNA'
FeaturePlot(
object = crc,
features = c('TREM1', 'EPCAM', "PTPRC", "IL1RL1","GATA3", "KIT"),
pt.size = 0.1,
max.cutoff = 'q95',
ncol = 2
)
Using these gene score values, we can assign cluster identities:
0 - epithelial cells
1 - myeloid_1
2 - basophils
3 - tcell
4 - myeloid_2
5 - mast cells
Just to show why we were more inclusive during the cell filtering stage:
crc$log10_mtDNA_depth <- log10(crc$mtDNA_depth)
FeaturePlot(crc, features = c("pct_reads_in_DNase"))
FeaturePlot(crc, features = c("log10_mtDNA_depth"))
Some other views of the overall QC now that annotations have been established:
ggplot(crc@meta.data, aes(x = log10(passed_filters), y = pct_reads_in_DNase, color = seurat_clusters)) +
geom_point() + pretty_plot() + L_border() + labs(x = "# Unique Fragments", y = "% Reads in Peaks")
ggplot(crc@meta.data, aes(x = log10_mtDNA_depth, y = pct_reads_in_DNase, color = seurat_clusters)) +
geom_point() + pretty_plot() + L_border() + labs(x = "mean mtDNA coverage", y = "% Reads in Peaks")
We can see that most of the low FRIP cells were the myeloid 1
cluster. This is some sort of intra-tumor granulocyte that has relatively poor accessible chromatin enrichment. Due to the unusual chromatin patterns in these cells, this cluster also has relatively low mtDNA coverage.
All of the above analyses are standard things that one can do with Signac. Now, let’s overlay mtDNA variants onto the analysis.
To do this, we call the function in the R script to find HQ mutations.
source("variant_calling.R")
# mgatk_se is the Summarized Experiment .rds file
# That is automatically produced from running
# The mgatk CLI python package
mut_se <- call_mutations_mgatk(mgatk_se)
## [1] "A"
## [1] "C"
## [1] "G"
## [1] "T"
misc_df <- data.frame(rowData(mut_se))
ggplot(misc_df %>% filter(n_cells_conf_detected >= 5), aes(x = strand_correlation, y = log10(vmr))) +
geom_point() +
labs(color = "HQ", x = "Pearson correlation (strand)", y = "log VMR") +
geom_vline(xintercept = 0.65, color = "black", linetype = 2) +
geom_hline(yintercept = -2, color = "black", linetype = 2) +
pretty_plot(fontsize = 12) + L_border() +
theme(legend.position = "bottom")
# Establish a filtered data frame of variants based on this processing
filter_df <- misc_df %>% dplyr::filter(n_cells_conf_detected >= 5 & strand_correlation >= 0.65 & log10(vmr) > -2)
Above, the plot is sort of the standard way to visualize potential variants that could be informative for subclustering. From this execution (which serves as pretty good defaults from my testing), we get 12 variants. Let’s look more closely:
filter_df[,c(1,2,3,5)]
## position nucleotide variant mean
## 1227G>A 1227 G>A 1227G>A 0.0078025
## 3244G>A 3244 G>A 3244G>A 0.0010942
## 6081G>A 6081 G>A 6081G>A 0.0027757
## 9804G>A 9804 G>A 9804G>A 0.0029640
## 12889G>A 12889 G>A 12889G>A 0.0234690
## 824T>C 824 T>C 824T>C 0.0049584
## 2285T>C 2285 T>C 2285T>C 0.0053491
## 9840T>C 9840 T>C 9840T>C 0.0019356
## 12731T>C 12731 T>C 12731T>C 0.0019040
## 16093T>C 16093 T>C 16093T>C 0.0069912
## 9728C>T 9728 C>T 9728C>T 0.0127868
## 16147C>T 16147 C>T 16147C>T 0.6185731
A few things stand out. First, 10 out of the 12 variants occur at less than 1% allele frequency in the population. However, 16147C>T is present at about 62%. We’ll see that this is a clonal variant marking the epithelial cells. Additionally, all of the called variants are transitions. This fits what know about mtDNA mutational frequency.
Currently, this is pretty messy, but I think that visualizing variants on the reduced dimension embedding is generally useful:
library(cowplot)
mito_df <- data.frame(
crc@meta.data,
crc@reductions$umap@cell.embeddings,
t(assays(mut_se)[["allele_frequency"]][as.character(filter_df$variant),]),
t(assays(mut_se)[["coverage"]][as.character(filter_df$variant),])
)
plot_mito_umap <- function(mito_df, variant){
# Ugly name that comes from making it a column names
ugly_var <- paste0("X", gsub(">", ".", variant))
mito_df$variant_value <- mito_df[,ugly_var]*100
ggplot(mito_df %>% arrange(variant_value), aes(x = UMAP_1, y = UMAP_2, color = variant_value)) +
geom_point() + theme_bw() +
scale_color_gradientn(colors = c("lightgrey", "firebrick")) +
labs(color = variant)
}
plot1 <- plot_mito_umap(mito_df, "16147C>T")
plot2 <- plot_mito_umap(mito_df, "9728C>T")
plot3 <- plot_mito_umap(mito_df, "1227G>A")
plot4 <- plot_mito_umap(mito_df, "12889G>A")
plot5 <- plot_mito_umap(mito_df, "3244G>A")
plot6 <- plot_mito_umap(mito_df, "824T>C")
cowplot::plot_grid(plot1, plot2, plot3, plot4, plot5, plot6, nrow = 3)
Here, we can see a few interesting patterns for the selected variants. 16147C>T is present in essentially all epithelial cells and almost exclusively in epithelial cells (the edge cases where this isn’t true are also cases where the UMAP and clustering don’t full agree). It is at 100% allele frequency– strongly suggestive of whatever cell of origin of this tumor had the mutation at 100% and then expanded.
We then see at least 3 variants 9728C>T, 1227G>A, 12889G>A that are mostly present specifically in the epithelial cells that define subclones. It’s not really enough cells to robustly define subclones however.
The variants 824T>C and 3244G>A are found specifically in immune cell populations, suggesting that these arose from a common hematopoetic progenitor cell (probably in the bone marrow).