8 Pre-processing CosMx data

The pre-processing is much similar to single-cell data. Now here we are starting with the seurat object that was downloaded from the AtoMx platform. I will start by loading this Seurat object in.

data_dir <- "./data/" # here the location to final seurat object
tma1 <- readRDS(paste0(data_dir, "seuratObject_TMA1.RDS"))
tma2 <- readRDS(paste0(data_dir,"seuratObject_TMA2.RDS"))
tma1$tma <- rep("TMA1", length(row.names(tma1@meta.data)))
tma2$tma <- rep("TMA2", length(row.names(tma2@meta.data)))

8.1 Doublets ——

Now here you have the option to run a doublet removal function such as DoubletFinder and use that object to filter your original data set. The reason for doublet removal is to take out some cells that are not properly segmented and more than 1 cell is called as 1 cell.

You can use a similar code to below to do the doublet removal. However, I need to note here that, if you want to do doublet removal on a TMA type of experiment, it is important to do doublet removal on a seurat object split into each core or sample and after the doublet removal merge them together for downstream analyses.

# library(DoubletFinder)
# merged <- qread("CosMx_BC/merged_1_2_filtered_clustered.qs")
# 
# sweep.res.list_kidney <- paramSweep(merged, PCs = 1:15, sct = TRUE)
# sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = FALSE)
# pdf("./CosMx_BC/doublets/bcmvn_merged_1_2.pdf")
#     bcmvn_kidney <- find.pK(sweep.stats_kidney)
# dev.off()

# ## Homotypic Doublet Proportion Estimate -------------------------------------------------------------------------------------
# annotations      <- merged@meta.data$seurat_clusters
# homotypic.prop   <- modelHomotypic(annotations)           ## ex: annotations <- so_whole@meta.data$ClusteringResults
# nExp_poi         <- round(0.075*nrow(merged@meta.data))  ## Assuming 7.5% doublet formation rate - tailor for your dataset
# nExp_poi.adj     <- round(nExp_poi*(1-homotypic.prop))

# Run DoubletFinder with varying classification stringencies ----------------------------------------------------------------
# merged           <- doubletFinder(merged, PCs = 1:15, pN = 0.25, pK = 0.09, nExp = nExp_poi, reuse.pANN = FALSE, sct = TRUE)
#so_whole <- doubletFinder_v3(so_whole, PCs = 1:10, pN = 0.25, pK = 0.09, nExp = nExp_poi.adj, reuse.pANN = "pANN_0.25_0.09_913", sct = TRUE)

# pdf("./CosMx_BC/doublets/doublets_merged_1_2.pdf", onefile = TRUE)
#     #DimPlot(merged, group.by = "DF.classifications_0.25_0.09_1100", order = "Doublet")
# dev.off()

# remove the doublets and recluster:
# merged_old <- merged
# merged <- subset(merged, subset = DF.classifications_0.25_0.09_1100 == 'Singlet')
tma1_doub <- qread(data_dir, "doublet_removed_merged_tma1.qs")
tma2_doub <- qread(data_dir, "doublet_removed_merged_tma2.qs")

tma1 <- tma1[, colnames(tma1_doub)]
tma2 <- tma2[, colnames(tma2_doub)]

tma1
tma2

8.2 Merging vs Integrating ——

Next depending on the experimental design you will either merge or integrate the slides. Let’s merge these two slide data from the two slides tma1 ans tma2 together as they ran in the same run. However, for slides from different runs, there is going to be batch effect so mere merging is not recommended. The batch effect needs to be corrected using a package like Harmony or LIGER similar to how you’d do it for single cell data.

merged  <- merge(tma1, tma2)
# optional: save it
qs::qsave(merged, "merged1_2.qs")

# merged <- qread("merged1_2.qs")

merged$fov_tma <- paste0(merged$tma, "_", merged$fov)

# negative probe handling
merged$pc_neg <-  (merged$nCount_negprobes / merged$nCount_RNA ) * 100
merged$avg_neg <-  colMeans(merged[["negprobes"]])

head(merged@meta.data)

8.3 Adding Histological Annotations ——-

Following part of creating a sample sheet is bespoke to each experiment but important. It is to match up the fovs with their biological groups (e.g. the tumour subtype, tumlour location, patient etc). I guess you willl have to create a similar sample_sheet for your data set regardless.

# most cell annotations
anno_table    <- read_csv(paste0(data_dir,"samplesheet_12.csv"))
anno_table$id <- paste0(anno_table$FoV_Name, "_", paste0("TMA",anno_table$TMA))

head(anno_table)

merged$sampleid             <- as.character(paste0(merged@meta.data$fov,"_", merged@meta.data$tma))

merged$Core_ID              <- factor(anno_table$Core_ID[match(merged$sampleid, anno_table$id)])
merged$Location             <- factor(anno_table$Location[match(merged$sampleid, anno_table$id)])
merged$Tumour_type          <- factor(anno_table$Tumour_type[match(merged$sampleid, anno_table$id)])
merged$Histology            <- factor(anno_table$Histology[match(merged$sampleid, anno_table$id)])
merged$Tumour_core_location <- factor(anno_table$Tumour_core_location[match(merged$sampleid, anno_table$id)])
merged$Tumour_core_location <- str_replace(merged$Tumour_core_location, "invasive front", "Invasive front") %>% factor()
merged$Tumour_grade         <- factor(anno_table$Tumour_grade[match(merged$sampleid, anno_table$id)])
merged$Chemo                <- factor(anno_table$Chemo[match(merged$sampleid, anno_table$id)])
merged$tumourtype_loc       <- factor(paste0(merged$Tumour_type , "_", merged$Tumour_core_location))

table(is.na(merged$Tumour_type))

8.4 QC Stats —-

plot_dir <-"./plots/"
dir.create(plot_dir, showWarnings = FALSE)

# config
min_count_per_cell <- 100
max_pc_negs        <- 1.5
max_avg_neg        <- 0.5
num_dims           <- 10
# qc plots
pdf(paste0(plot_dir,"basicqc.pdf"), onefile = TRUE)
ggplot(merged@meta.data, aes(x=nCount_RNA, col=Tumour_type)) +
  geom_density() + 
  geom_vline(xintercept = min_count_per_cell, lty=3) +
  scale_x_log10() +
  theme_bw() +
  ggtitle("Counts per cell")

ggplot(merged@meta.data, aes(x=pc_neg, col=Tumour_type)) +
  geom_density() + 
  geom_vline(xintercept = max_pc_negs, lty=3) +
  scale_x_log10() +
  theme_bw() +
  ggtitle("Negative probe composition")

ggplot(merged@meta.data, aes(x=avg_neg, col=Tumour_type)) +
  geom_density() + 
  geom_vline(xintercept = max_avg_neg, lty=3) +
  theme_bw() +
  ggtitle("Negative probe average")

ggplot(merged@meta.data, aes(y=avg_neg, x=nCount_RNA)) +
  geom_point(pch=3, alpha=0.1) + 
  geom_hline(yintercept = max_avg_neg, lty=3) +
  geom_vline(xintercept = min_count_per_cell, lty=3) +
  scale_x_log10() + 
  theme_bw() +
  ggtitle("Negative probes vs counts")

dev.off()

pdf(paste0(plot_dir,"basicqc_tumourcore.pdf"), onefile = TRUE)
ggplot(merged@meta.data, aes(x=nCount_RNA, col=Tumour_core_location)) +
  geom_density() + 
  geom_vline(xintercept = min_count_per_cell, lty=3) +
  scale_x_log10() +
  theme_bw() +
  ggtitle("Counts per cell")

ggplot(merged@meta.data, aes(x=pc_neg, col=Tumour_core_location)) +
  geom_density() + 
  geom_vline(xintercept = max_pc_negs, lty=3) +
  scale_x_log10() +
  theme_bw() +
  ggtitle("Negative probe composition")

ggplot(merged@meta.data, aes(x=avg_neg, col=Tumour_core_location)) +
  geom_density() + 
  geom_vline(xintercept = max_avg_neg, lty=3) +
  theme_bw() +
  ggtitle("Negative probe average")

dev.off()

pdf(paste0(plot_dir,"basicqc_tumourtypeloc.pdf"), onefile = TRUE)
ggplot(merged@meta.data, aes(x=nCount_RNA, col=tumourtype_loc)) +
  geom_density() + 
  geom_vline(xintercept = min_count_per_cell, lty=3) +
  scale_x_log10() +
  theme_bw() +
  ggtitle("Counts per cell")

ggplot(merged@meta.data, aes(x=pc_neg, col=tumourtype_loc)) +
  geom_density() + 
  geom_vline(xintercept = max_pc_negs, lty=3) +
  scale_x_log10() +
  theme_bw() +
  ggtitle("Negative probe composition")

ggplot(merged@meta.data, aes(x=avg_neg, col=tumourtype_loc)) +
  geom_density() + 
  geom_vline(xintercept = max_avg_neg, lty=3) +
  theme_bw() +
  ggtitle("Negative probe average")

dev.off()

# Grab the negative probes
negprb_1 <- merged@assays$negprobes@counts
rownames(negprb_1)

v1 <- VlnPlot(merged, group.by = "Tumour_type", features = c("nCount_RNA", "nFeature_RNA"), pt.size = 0.1) + NoLegend()
v2 <- VlnPlot(merged, group.by = "Tumour_type", features = c("nCount_negprobes", "nFeature_negprobes"), pt.size = 0.1) + NoLegend()

pdf(paste0(plot_dir,"count_feature_violin.pdf"))
wrap_plots(v1, v2, ncol = 1)
dev.off()

Now the function ImageDimPlot will let you visualise a bunch of FOs or the whole slide.

tmas <- c("TMA.1.of.2", "TMA.2.of.2")
# looking at fovs separaely on tma1 and 2
pdf(paste0(plot_dir,"initial_imagedimplots.pdf"), onefile = TRUE)
lapply(tmas,function(x){
  ImageDimPlot(merged, fov = x, axes = TRUE, group.by = "tumourtype_loc", cols = "glasbey") + ggtitle(x)
})
dev.off()

8.5 Filtering and Cleaning Up —–

Again, there is ot hard and fast rule for filtering. You can have.a few iterations of tabve plots with different levels of filtering to figure out where the cutoff needs to be.

# Basic filtering
merged <- subset(merged, 
                subset = nCount_RNA < 7500 & nCount_RNA > 200 & nFeature_RNA > 100 & nFeature_RNA < 3500) merged <- merged[, colSums(merged) >=20]

Here it needs to be noted that there are no mitochondrial or cell cycle genes to regress for CosMx data because those genes are not in the panel.