Load in count file

counts_file <- "Data/SHSY5Ycelldiff_Pezzini2016_RawData.tsv"
count_cols <- c("SRR3133925_nodiff_rep1", "SRR3133926_nodiff_rep2", "SRR3133927_nodiff_rep3", "SRR3133928_diff_rep1", "SRR3133929_diff_rep2", "SRR3133930_diff_rep3")
design <- matrix(c(c(1, 1, 1, 0, 0, 0), c(0, 0, 0, 1, 1, 1)), ncol = 2, dimnames = list(c("SRR3133925_nodiff_rep1", "SRR3133926_nodiff_rep2", "SRR3133927_nodiff_rep3", "SRR3133928_diff_rep1", "SRR3133929_diff_rep2", "SRR3133930_diff_rep3"), c("Undifferentiated ", "Differentiated")))
cont.matrix <- matrix(c(c(-1, 1)), ncol = 1, dimnames = list(c("Undifferentiated ", "Differentiated"), c("Differentiated")))
export_cols <- c(c("Gene.ID", "Gene.Name", "SRR3133925_nodiff_rep1", "SRR3133926_nodiff_rep2", "SRR3133927_nodiff_rep3", "SRR3133928_diff_rep1", "SRR3133929_diff_rep2", "SRR3133930_diff_rep3"))

# Maybe filter out samples that are not used in the model
if (FALSE) {
    # Remove columns not used in the comparison
    use.samples <- rowSums((design %*% cont.matrix) != 0) > 0
    use.conditions <- colSums(design[use.samples, ] != 0) > 0

    count_cols <- count_cols[use.samples, drop = F]
    design <- design[use.samples, use.conditions, drop = F]
    cont.matrix <- cont.matrix[use.conditions, , drop = F]
}

# fileEncoding='UTF-8-BOM' should strip the BOM marker FEFF that some windows tools add
x <- read.delim(counts_file,
    sep = "\t",
    check.names = FALSE,
    colClasses = "character",
    na.strings = c(),
    skip = 0,
    fileEncoding = "UTF-8-BOM"
)

Filter data

colnames(x) <- scan(counts_file,
    what = "",
    sep = " ",
    nlines = 1,
    strip.white = F,
    quote = "\"",
    skip = "0",
    fileEncoding = "UTF-8-BOM"
)

x[, count_cols] <- apply(x[, count_cols], 2, function(v) as.numeric(v)) # Force numeric count columns
counts <- x[, count_cols]

# Keep rows based on string based filters of columns.  Rows must match all filters
filter_rows <- fromJSON("[]")
if (length(filter_rows) > 0) {
    keepRows <- apply(apply(filter_rows, 1, function(r) grepl(r["regexp"], x[, r["column"]], perl = T, ignore.case = T)), 1, all)
} else {
    keepRows <- rep(TRUE, nrow(x))
}

keepMin <- apply(counts, 1, max) >= 10.0
keepCpm <- rowSums(cpm(counts) > 0.0) >= 0 # Keep only genes with cpm above x in at least y samples

keep <- keepMin & keepCpm & keepRows
x <- x[keep, ]
counts <- counts[keep, ]

TMM normalisation

nf <- calcNormFactors(counts, method = "TMM")
y <- voom(counts, design, plot = FALSE, lib.size = colSums(counts) * nf)

y$genes <- x$"Gene.ID"
normalized <- y$E
grp1 <- grep("_diff_", colnames(normalized))
grp2 <- grep("_nodiff_", colnames(normalized))
normalized <- normalized[, c(grp1, grp2)] # Re-ordering exp data
row.names(normalized) <- y$genes
write.table(normalized, file = "Data/TMM_normalized.csv", row.names = FALSE, sep = ",", na = "", quote = FALSE)

Write expression data in .gct format

# Create .gct file
normalizedDF <- cbind(description = "NA", normalized)
normalizedDF <- data.frame(normalizedDF) %>%
    rownames_to_column(var = "NAME")
write.table(normalizedDF, file = "Data/TMM_normalized.tsv", row.names = F, sep = "\t", na = "", quote = FALSE)
new_rows <- c("#1.2", paste(nrow(normalized), ncol(normalized), sep = "\t"))
expressed_data <- readLines("Data/TMM_normalized.tsv")
GCT_FILE <- c(new_rows, expressed_data)
writeLines(GCT_FILE, "Data/Expression_Data.gct")

# Create .cls file
sampls <- ncol(normalized)
groups <- 2
labels <- c(rep("Diff", 3), rep("Nodiff", 3))
uniq <- unique(labels)
cls_content <- c(
    paste(sampls, groups, 1), # Number of samples, number of classes, and type
    paste("#", paste(uniq, collapse = " ")), # Class labels
    paste(labels, collapse = " ") # Phenotype labels for each sample
)
writeLines(cls_content, con = "Data/Phenotype_Labels.cls")

Calculate t statistics (for pre-ranked GSEA analysis)

fit <- lmFit(y, design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)

# Ranking
rankings <- data.frame(gene = fit2$genes, t.stat = fit2$t[, 1]) %>%
    arrange(t.stat)
rankings_m <- merge(x[, c("Gene.ID", "Gene.Name")], rankings, by.x = "Gene.ID", by.y = "gene", all.x = TRUE)
rankings_m <- rankings_m %>% arrange(desc(t.stat))
names(rankings_m) <- c("ENSEMBL", "SYMBOL", "t.stat")
write.table(rankings_m, file = "Data/Rankings.csv", row.names = FALSE, sep = ",", na = "", quote = FALSE)
write.table(rankings_m$ENSEMBL, file = "Data/Pre_Ranked_List_ENSEMBL.csv", row.names = FALSE, col.names = FALSE, sep = ",", na = "", quote = FALSE)
write.table(rankings_m$SYMBOL, file = "Data/Pre_Ranked_List_SYMBOL.csv", row.names = FALSE, col.names = FALSE, sep = ",", na = "", quote = FALSE)

Visualisation

plot(rankings_m$t.stat, xlab = "Gene No", ylab = "t-stat")

boxplot(rankings_m$t.stat)

Ranking Genes and Creating .gct & .cls Files

Files required for GSEA analysis in MSigDB or GenePattern

HK

2024-11-12

Load in count file

Filter data

TMM normalisation

Write expression data in .gct format

Calculate t statistics (for pre-ranked GSEA analysis)

Visualisation