Load in count file
counts_file <- "Data/SHSY5Ycelldiff_Pezzini2016_RawData.tsv"
count_cols <- c("SRR3133925_nodiff_rep1", "SRR3133926_nodiff_rep2", "SRR3133927_nodiff_rep3", "SRR3133928_diff_rep1", "SRR3133929_diff_rep2", "SRR3133930_diff_rep3")
design <- matrix(c(c(1, 1, 1, 0, 0, 0), c(0, 0, 0, 1, 1, 1)), ncol = 2, dimnames = list(c("SRR3133925_nodiff_rep1", "SRR3133926_nodiff_rep2", "SRR3133927_nodiff_rep3", "SRR3133928_diff_rep1", "SRR3133929_diff_rep2", "SRR3133930_diff_rep3"), c("Undifferentiated ", "Differentiated")))
cont.matrix <- matrix(c(c(-1, 1)), ncol = 1, dimnames = list(c("Undifferentiated ", "Differentiated"), c("Differentiated")))
export_cols <- c(c("Gene.ID", "Gene.Name", "SRR3133925_nodiff_rep1", "SRR3133926_nodiff_rep2", "SRR3133927_nodiff_rep3", "SRR3133928_diff_rep1", "SRR3133929_diff_rep2", "SRR3133930_diff_rep3"))
# Maybe filter out samples that are not used in the model
if (FALSE) {
# Remove columns not used in the comparison
use.samples <- rowSums((design %*% cont.matrix) != 0) > 0
use.conditions <- colSums(design[use.samples, ] != 0) > 0
count_cols <- count_cols[use.samples, drop = F]
design <- design[use.samples, use.conditions, drop = F]
cont.matrix <- cont.matrix[use.conditions, , drop = F]
}
# fileEncoding='UTF-8-BOM' should strip the BOM marker FEFF that some windows tools add
x <- read.delim(counts_file,
sep = "\t",
check.names = FALSE,
colClasses = "character",
na.strings = c(),
skip = 0,
fileEncoding = "UTF-8-BOM"
)
Filter data
colnames(x) <- scan(counts_file,
what = "",
sep = " ",
nlines = 1,
strip.white = F,
quote = "\"",
skip = "0",
fileEncoding = "UTF-8-BOM"
)
x[, count_cols] <- apply(x[, count_cols], 2, function(v) as.numeric(v)) # Force numeric count columns
counts <- x[, count_cols]
# Keep rows based on string based filters of columns. Rows must match all filters
filter_rows <- fromJSON("[]")
if (length(filter_rows) > 0) {
keepRows <- apply(apply(filter_rows, 1, function(r) grepl(r["regexp"], x[, r["column"]], perl = T, ignore.case = T)), 1, all)
} else {
keepRows <- rep(TRUE, nrow(x))
}
keepMin <- apply(counts, 1, max) >= 10.0
keepCpm <- rowSums(cpm(counts) > 0.0) >= 0 # Keep only genes with cpm above x in at least y samples
keep <- keepMin & keepCpm & keepRows
x <- x[keep, ]
counts <- counts[keep, ]
TMM normalisation
nf <- calcNormFactors(counts, method = "TMM")
y <- voom(counts, design, plot = FALSE, lib.size = colSums(counts) * nf)
y$genes <- x$"Gene.ID"
normalized <- y$E
grp1 <- grep("_diff_", colnames(normalized))
grp2 <- grep("_nodiff_", colnames(normalized))
normalized <- normalized[, c(grp1, grp2)] # Re-ordering exp data
row.names(normalized) <- y$genes
write.table(normalized, file = "Data/TMM_normalized.csv", row.names = FALSE, sep = ",", na = "", quote = FALSE)
Calculate t statistics (for pre-ranked GSEA analysis)
fit <- lmFit(y, design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
# Ranking
rankings <- data.frame(gene = fit2$genes, t.stat = fit2$t[, 1]) %>%
arrange(t.stat)
rankings_m <- merge(x[, c("Gene.ID", "Gene.Name")], rankings, by.x = "Gene.ID", by.y = "gene", all.x = TRUE)
rankings_m <- rankings_m %>% arrange(desc(t.stat))
names(rankings_m) <- c("ENSEMBL", "SYMBOL", "t.stat")
write.table(rankings_m, file = "Data/Rankings.csv", row.names = FALSE, sep = ",", na = "", quote = FALSE)
write.table(rankings_m$ENSEMBL, file = "Data/Pre_Ranked_List_ENSEMBL.csv", row.names = FALSE, col.names = FALSE, sep = ",", na = "", quote = FALSE)
write.table(rankings_m$SYMBOL, file = "Data/Pre_Ranked_List_SYMBOL.csv", row.names = FALSE, col.names = FALSE, sep = ",", na = "", quote = FALSE)
Visualisation
plot(rankings_m$t.stat, xlab = "Gene No", ylab = "t-stat")
boxplot(rankings_m$t.stat)