This dataset

Here, we profiled a modest number (aimed for ~2,500) cells from a patient with a colorectal cancer (CRC) tumor. The tumor contains both malignant epithelial cells as well as infiltrating immune populations. This is a simple example that should serve as a basis for using mtscATAC-seq data to infer cell states based on chromatin data and clonal populations based on mtDNA sequencing.

Identifying high-quality cells/barcodes

Here is basic filtering of barcodes after the 10x knee-call– nothing too inspired.

There a population of myeloid cells that apparently didn’t yield very good accessible chromatin (which we will explore more below), so we are setting an inclusive filter to retain these cells.

counts <- Read10X_h5(filename = "../data/CRC-CellRanger/outs/CRC_v12-mtMask_mgatk.filtered_peak_bc_matrix.h5")
metadata <- read.csv(
file = "../data/CRC-CellRanger/outs/CRC_v12-mtMask_mgatk.singlecell.csv",
header = TRUE,
row.names = 1
)

crc <- CreateSeuratObject(
counts = counts,
assay = 'peaks',
project = 'ATAC',
min.cells = 1,
meta.data = metadata
)

fragment.path <- '../data/CRC-CellRanger/outs/CRC_v12-mtMask_mgatk.fragments.tsv.gz'

crc <- SetFragments(
object = crc,
file = fragment.path
)

# Augment QC
crc$pct_reads_in_peaks <- crc$peak_region_fragments / crc$passed_filters * 100
crc$pct_reads_in_DNase <- crc$DNase_sensitive_region_fragments / crc$passed_filters * 100

# There will be a population of high fragment, low frip cells that we will choose to keep to examine downstream
qplot(log10(crc@meta.data$passed_filters), crc@meta.data$pct_reads_in_DNase) +
pretty_plot() + L_border() + labs(x = "# Unique Fragments", y = "% Reads in Peaks") +
geom_vline(xintercept = 3, color = "firebrick") +
geom_hline(yintercept = 40, color = "firebrick")

crc_filt <- subset(crc, subset = passed_filters >= 1000 & pct_reads_in_DNase >= 40)
crc_filt

## An object of class Seurat 
## 87269 features across 2003 samples within 1 assay 
## Active assay: peaks (87269 features, 0 variable features)

Augment with mtDNA data to further filter barcodes for analysis.

Typically, I would filter cells for a minimum 20x coverage (esp. for hematopoetic cells), but for this tumor setting, a 10x filter had to be used because of the low-ish copy number. I think that this mostly has to do with needing to optimize lysis conditions in solid tissues.

mgatk_se <- readRDS("../data/CRC_mgatk.rds")

# Subset to HQ cells that exist so far
mgatk_se <- mgatk_se[,colnames(crc_filt)]

# Threshold based on abundance of mtDNA
crc_filt$mtDNA_depth <- mgatk_se$depth

# Could probably replace this with a violin plot
qplot(crc_filt$mtDNA_depth) + scale_x_log10(breaks = c(1,10,100)) +
  geom_vline(xintercept = 10, color = "firebrick") +
  pretty_plot() + L_border() + labs(x = "mean mtDNA depth/bp", y = "count")

# Final filter -- mgatk, Signac, and fragments all
crc_filt2 <- subset(crc_filt, mtDNA_depth >=10)
filtered.fragment.path <- "../outs/CRC-filtered.fragments.tsv"
mgatk_se <- mgatk_se[,colnames(crc_filt2)]

if(!file.exists(paste0(filtered.fragment.path, ".bgz"))){
  FilterFragments(
    fragment.path = fragment.path,
    cells = colnames(crc_filt2),
    output.path = filtered.fragment.path
  )
}

Run dimensionality reduction

Having now filtered out cells with decent chromatin and mitochondrial data, let’s visualize the ATAC data using standard Signac workflows…

# Rename for convenience
crc <- crc_filt2

# Run semi-standard Signac dimension reduction, clustering, etc. 
crc <- RunTFIDF(crc)
crc <- FindTopFeatures(crc, min.cutoff = 'q50')
crc <- RunSVD(
  object = crc,
  assay = 'peaks',
  reduction.key = 'LSI_',
  reduction.name = 'lsi',
  irlba.work=500
)

crc <- RunUMAP(object = crc, reduction = 'lsi', dims = 2:30)
crc <- FindNeighbors(object = crc, reduction = 'lsi', dims = 2:30)
crc <- FindClusters(object = crc, verbose = FALSE, algorithm = 3, resolution = 0.5)
DimPlot(object = crc, label = TRUE) + NoLegend()

Generate gene scores

Here, we can see that some clusters have less mitochondrial DNA content as well as worse reads in DNase peaks than others. Let’s define the cell states using marker genes to better understand this…

ga_rds <- "../outs/CRC-geneActivites.rds"
if(!file.exists(ga_rds)){
  
  # Pull gene coordinates 
  gene.coords <- genes(EnsDb.Hsapiens.v75, filter = ~ gene_biotype == "protein_coding")
  seqlevelsStyle(gene.coords) <- 'UCSC'
  genebody.coords <- keepStandardChromosomes(gene.coords, pruning.mode = 'coarse')
  genebodyandpromoter.coords <- Extend(x = gene.coords, upstream = 2000, downstream = 0)
  
  # create a gene by cell matrix of scores
  gene.activities <- FeatureMatrix(
    fragments = paste0(filtered.fragment.path, ".bgz"),
    features = genebodyandpromoter.coords,
    cells = colnames(crc),
    chunk = 20
  )
  
  # convert rownames from chromsomal coordinates into gene names
  gene.key <- genebodyandpromoter.coords$gene_name
  names(gene.key) <- GRangesToString(grange = genebodyandpromoter.coords)
  rownames(gene.activities) <- gene.key[rownames(gene.activities)]
  saveRDS(gene.activities, ga_rds)
} else{
  gene.activities <- readRDS(ga_rds)
}
# Put into Seurat object
crc[['RNA']] <- CreateAssayObject(counts = gene.activities)
crc <- NormalizeData(
  object = crc,
  assay = 'RNA',
  normalization.method = 'LogNormalize',
  scale.factor = median(crc$nCount_RNA)
)

Visualize interesting gene activity scores

EPCAM is a marker for epithelial cells
TREM1 is a meyloid marker
PTPRC = CD45 is a pan-immune cell marker
IL1RL1 are basophil markers
GATA3 is a T-cell maker

DefaultAssay(crc) <- 'RNA'

FeaturePlot(
  object = crc,
  features = c('TREM1', 'EPCAM', "PTPRC", "IL1RL1","GATA3", "KIT"),
  pt.size = 0.1,
  max.cutoff = 'q95',
  ncol = 2
)

Using these gene score values, we can assign cluster identities:

0 - epithelial cells
1 - myeloid_1
2 - basophils
3 - tcell
4 - myeloid_2
5 - mast cells

Look back at QC

Just to show why we were more inclusive during the cell filtering stage:

crc$log10_mtDNA_depth <- log10(crc$mtDNA_depth)
FeaturePlot(crc, features = c("pct_reads_in_DNase"))

FeaturePlot(crc, features = c("log10_mtDNA_depth"))

Some other views of the overall QC now that annotations have been established:

ggplot(crc@meta.data, aes(x = log10(passed_filters), y = pct_reads_in_DNase, color = seurat_clusters)) +
  geom_point() + pretty_plot() + L_border() + labs(x = "# Unique Fragments", y = "% Reads in Peaks")

ggplot(crc@meta.data, aes(x = log10_mtDNA_depth, y = pct_reads_in_DNase, color = seurat_clusters)) +
  geom_point() + pretty_plot() + L_border() + labs(x = "mean mtDNA coverage", y = "% Reads in Peaks")

We can see that most of the low FRIP cells were the myeloid 1 cluster. This is some sort of intra-tumor granulocyte that has relatively poor accessible chromatin enrichment. Due to the unusual chromatin patterns in these cells, this cluster also has relatively low mtDNA coverage.

Find mtDNA variants

All of the above analyses are standard things that one can do with Signac. Now, let’s overlay mtDNA variants onto the analysis.

To do this, we call the function in the R script to find HQ mutations.

source("variant_calling.R") 

# mgatk_se is the Summarized Experiment .rds file
# That is automatically produced from running
# The mgatk CLI python package 

mut_se <- call_mutations_mgatk(mgatk_se)

## [1] "A"
## [1] "C"
## [1] "G"
## [1] "T"

misc_df <- data.frame(rowData(mut_se))

ggplot(misc_df %>% filter(n_cells_conf_detected >= 5), aes(x = strand_correlation, y = log10(vmr))) +
  geom_point() +
  labs(color = "HQ", x = "Pearson correlation (strand)", y = "log VMR") +
  geom_vline(xintercept = 0.65, color = "black", linetype = 2) +
  geom_hline(yintercept = -2, color = "black", linetype = 2) +
  pretty_plot(fontsize = 12) + L_border() +
  theme(legend.position = "bottom")

# Establish a filtered data frame of variants based on this processing
filter_df <- misc_df %>% dplyr::filter(n_cells_conf_detected >= 5 & strand_correlation >= 0.65 & log10(vmr) > -2)

Above, the plot is sort of the standard way to visualize potential variants that could be informative for subclustering. From this execution (which serves as pretty good defaults from my testing), we get 12 variants. Let’s look more closely:

filter_df[,c(1,2,3,5)]

##          position nucleotide  variant      mean
## 1227G>A      1227        G>A  1227G>A 0.0078025
## 3244G>A      3244        G>A  3244G>A 0.0010942
## 6081G>A      6081        G>A  6081G>A 0.0027757
## 9804G>A      9804        G>A  9804G>A 0.0029640
## 12889G>A    12889        G>A 12889G>A 0.0234690
## 824T>C        824        T>C   824T>C 0.0049584
## 2285T>C      2285        T>C  2285T>C 0.0053491
## 9840T>C      9840        T>C  9840T>C 0.0019356
## 12731T>C    12731        T>C 12731T>C 0.0019040
## 16093T>C    16093        T>C 16093T>C 0.0069912
## 9728C>T      9728        C>T  9728C>T 0.0127868
## 16147C>T    16147        C>T 16147C>T 0.6185731

A few things stand out. First, 10 out of the 12 variants occur at less than 1% allele frequency in the population. However, 16147C>T is present at about 62%. We’ll see that this is a clonal variant marking the epithelial cells. Additionally, all of the called variants are transitions. This fits what know about mtDNA mutational frequency.

Visualize a subset of the called variants

Currently, this is pretty messy, but I think that visualizing variants on the reduced dimension embedding is generally useful:

library(cowplot)
mito_df <- data.frame(
  crc@meta.data,
  crc@reductions$umap@cell.embeddings,
  t(assays(mut_se)[["allele_frequency"]][as.character(filter_df$variant),]),
  t(assays(mut_se)[["coverage"]][as.character(filter_df$variant),])
)

plot_mito_umap <- function(mito_df, variant){
  
  # Ugly name that comes from making it a column names
  ugly_var <- paste0("X", gsub(">", ".", variant))
  mito_df$variant_value <- mito_df[,ugly_var]*100
  
  ggplot(mito_df %>% arrange(variant_value), aes(x = UMAP_1, y = UMAP_2, color = variant_value)) +
    geom_point() + theme_bw() +
    scale_color_gradientn(colors = c("lightgrey", "firebrick")) +
    labs(color = variant)
}

plot1 <- plot_mito_umap(mito_df, "16147C>T")
plot2 <- plot_mito_umap(mito_df, "9728C>T")
plot3 <- plot_mito_umap(mito_df, "1227G>A")
plot4 <- plot_mito_umap(mito_df, "12889G>A")
plot5 <- plot_mito_umap(mito_df, "3244G>A")
plot6 <- plot_mito_umap(mito_df, "824T>C")


cowplot::plot_grid(plot1, plot2, plot3, plot4, plot5, plot6, nrow = 3)

Here, we can see a few interesting patterns for the selected variants. 16147C>T is present in essentially all epithelial cells and almost exclusively in epithelial cells (the edge cases where this isn’t true are also cases where the UMAP and clustering don’t full agree). It is at 100% allele frequency– strongly suggestive of whatever cell of origin of this tumor had the mutation at 100% and then expanded.

We then see at least 3 variants 9728C>T, 1227G>A, 12889G>A that are mostly present specifically in the epithelial cells that define subclones. It’s not really enough cells to robustly define subclones however.

The variants 824T>C and 3244G>A are found specifically in immune cell populations, suggesting that these arose from a common hematopoetic progenitor cell (probably in the bone marrow).

Example analysis workflow using mtscATAC-seq data

Caleb Lareau

2020-08-10

Overview