## To install needed CRAN packages:
#
# install.packages("tidyverse")
# install.packages("multcomp")
#
## To install needed Bioconductor packages:
#
# source("https://bioconductor.org/biocLite.R")
# biocLite(c("limma", "edgeR"))
#
library(MASS) # ginv -- coefficient estimation
library(splines) # ns, bs -- spline curves
library(multcomp) # glht -- linear hypotheses
library(edgeR) # cpm, etc -- RNA-Seq normalization
library(limma) # lmFit, etc -- fitting many models
library(tidyverse) # working with data frames, plotting
Much of what we will be using is built into R without loading any packages.
The age (year) and height (cm) of 10 people has been measured. We want a model that can predict height based on age.
people <- read_csv(
"age, height
10, 131
14, 147
16, 161
9, 136
16, 170
15, 160
15, 153
21, 187
9, 145
21, 195")
ggplot(people, aes(x=age, y=height)) + geom_point()
fit <- lm(height ~ 1+age, data=people)
fit
##
## Call:
## lm(formula = height ~ 1 + age, data = people)
##
## Coefficients:
## (Intercept) age
## 92.612 4.513
lm(height ~ age, data=people) # Implicit intercept term
##
## Call:
## lm(formula = height ~ age, data = people)
##
## Coefficients:
## (Intercept) age
## 92.612 4.513
lm(height ~ 0+age, data=people) # Use 0 or -1 to omit
##
## Call:
## lm(formula = height ~ 0 + age, data = people)
##
## Coefficients:
## age
## 10.39
Coefficients are extracted with coef
:
coef(fit)
## (Intercept) age
## 92.611502 4.512911
The residual standard deviation is extracted with sigma
:
sigma(fit)
## [1] 7.263536
Behind the scenes a matrix of predictors has been produced from the formula notation ~ 1+age
. We can examine it explicitly:
model.matrix(fit)
model.matrix
can be used without first calling lm
.
model.matrix(~ 1+age, data=people)
## (Intercept) age
## 1 1 10
## 2 1 14
## 3 1 16
## 4 1 9
## 5 1 16
## 6 1 15
## 7 1 15
## 8 1 21
## 9 1 9
## 10 1 21
## attr(,"assign")
## [1] 0 1
n=10 observations minus p=2 columns in the model matrix leaves 8 residual degrees of freedom:
df.residual(fit)
## [1] 8
predict
predicts. By default it produces predictions on the original dataset.
predict(fit)
## 1 2 3 4 5 6 7 8
## 137.7406 155.7923 164.8181 133.2277 164.8181 160.3052 160.3052 187.3826
## 9 10
## 133.2277 187.3826
predict(fit, interval="confidence")
## fit lwr upr
## 1 137.7406 129.8100 145.6712
## 2 155.7923 150.4399 161.1446
## 3 164.8181 159.2250 170.4111
## 4 133.2277 124.3009 142.1545
## 5 164.8181 159.2250 170.4111
## 6 160.3052 154.9836 165.6267
## 7 160.3052 154.9836 165.6267
## 8 187.3826 177.6105 197.1547
## 9 133.2277 124.3009 142.1545
## 10 187.3826 177.6105 197.1547
We can also calculate predictions manually.
# Prediction for a 15-year old
x <- c(1, 15)
beta <- coef(fit)
sum(x * beta)
## [1] 160.3052
# Prediction for all original data
X <- model.matrix(fit)
as.vector( X %*% beta )
## [1] 137.7406 155.7923 164.8181 133.2277 164.8181 160.3052 160.3052
## [8] 187.3826 133.2277 187.3826
predict
can be used with new data.
new_people <- data_frame(age=5:25)
predict(fit, new_people)
## 1 2 3 4 5 6 7 8
## 115.1761 119.6890 124.2019 128.7148 133.2277 137.7406 142.2535 146.7664
## 9 10 11 12 13 14 15 16
## 151.2793 155.7923 160.3052 164.8181 169.3310 173.8439 178.3568 182.8697
## 17 18 19 20 21
## 187.3826 191.8955 196.4085 200.9214 205.4343
new_predictions <- cbind(
new_people,
predict(fit, new_people, interval="confidence"))
ggplot() +
geom_ribbon(aes(x=age, ymin=lwr, ymax=upr), data=new_predictions, fill="grey") +
geom_line(aes(x=age, y=fit), data=new_predictions, color="blue") +
geom_point(aes(x=age, y=height), data=people) +
labs(y="height (cm)", x="age (year)",
subtitle="Ribbon shows 95% confidence interval of the model")
If you have ever used geom_smooth
, it should now be a little less mysterious.
ggplot(people, aes(x=age, y=height)) + geom_smooth(method="lm") + geom_point()
The residuals are the differences between predicted and actual values.
residuals(fit)
## 1 2 3 4 5 6
## -6.7406103 -8.7922535 -3.8180751 2.7723005 5.1819249 -0.3051643
## 7 8 9 10
## -7.3051643 -0.3826291 11.7723005 7.6173709
plot(predict(fit), residuals(fit))
Residuals should be close to normally distributed.
qqnorm(residuals(fit))
qqline(residuals(fit))
plot(fit)
produces a series of more sophisticated diagnostic plots.
plot(fit)
The anova( )
function performs an F-test to compare two models. ANOVA means ANalysis Of VAriance, because the test compares the residual variances of the two models.
The simpler model formula must nest inside the more complex model formula, or the test will be invalid: any model possible with the simpler formula must have an equivalent with the more complex formula. The test tells us if we have grounds to reject the simpler model formula.
fit_noslope <- lm(height ~ 1, data=people)
anova(fit_noslope, fit)
## Analysis of Variance Table
##
## Model 1: height ~ 1
## Model 2: height ~ 1 + age
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 9 3892.5
## 2 8 422.1 1 3470.4 65.779 3.956e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit_nointercept <- lm(height ~ 0+age, data=people)
anova(fit_nointercept, fit)
## Analysis of Variance Table
##
## Model 1: height ~ 0 + age
## Model 2: height ~ 1 + age
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 9 6770.9
## 2 8 422.1 1 6348.8 120.34 4.236e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary( )
is a quick and dirty way to compare a model to models with each of the terms dropped in turn.
summary(fit)
##
## Call:
## lm(formula = height ~ 1 + age, data = people)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7923 -6.0100 -0.3439 4.5795 11.7723
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.6115 8.4424 10.97 4.24e-06 ***
## age 4.5129 0.5564 8.11 3.96e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.264 on 8 degrees of freedom
## Multiple R-squared: 0.8916, Adjusted R-squared: 0.878
## F-statistic: 65.78 on 1 and 8 DF, p-value: 3.956e-05
Note that the p-values match those from the calls to anova( )
.
Finally, confint( )
gives confidence intervals for coefficients. If a coefficient had p-value 0.05 in the above tests, the confidence interval would just touch zero.
confint(fit)
## 2.5 % 97.5 %
## (Intercept) 73.143293 112.079712
## age 3.229773 5.796049
y <- c(5,3,2,4,1)
fit0 <- lm(y ~ 0)
fit1 <- lm(y ~ 1)
anova(fit0, fit1)
Describe the null hypothesis (fit0).
Describe the alternative hypothesis (fit1).
What have we shown with this test?
Consider a simple experiment where two treatments are compared with a control group. This is called a one-way ANOVA experiment. This is another usage of the term ANOVA! The F-test we just used was developed by Fisher in the context of this type of model.
outcomes <- read_csv(
"group, outcome
control, 5.46
control, 2.06
control, 2.74
control, 6.02
a, 8.31
a, 11.75
a, 6.13
a, 10.59
b, 1.02
b, 3.69
b, 2.52")
outcomes$group <- factor(outcomes$group, c("control", "a", "b"))
ggplot(outcomes, aes(x=group, y=outcome)) + geom_point()
outfit <- lm(outcome ~ group, data=outcomes)
outfit
##
## Call:
## lm(formula = outcome ~ group, data = outcomes)
##
## Coefficients:
## (Intercept) groupa groupb
## 4.070 5.125 -1.660
df.residual(outfit)
## [1] 8
sigma(outfit)
## [1] 2.054802
model.matrix(outfit)
## (Intercept) groupa groupb
## 1 1 0 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 1 1 0
## 6 1 1 0
## 7 1 1 0
## 8 1 1 0
## 9 1 0 1
## 10 1 0 1
## 11 1 0 1
## attr(,"assign")
## [1] 0 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
It’s important to understand how the factor has been encoded. It’s a bit weird, but it’s a system that extends well to models with multiple factors. The factor is encoded as “indicator variables” for all but the first level in the factor (“one-hot encoding”). This means:
(Intercept)
represents the average for the control groupgroupa
is the difference from the control to group “a”groupb
is the difference from the control to group “b”If this becomes confusing, it’s possible to directly examine how coefficients are fitted, which is by multiplying by the “Moore-Penrose generalized inverse” of the model matrix.
library(MASS)
X <- model.matrix(~ group, data=outcomes)
ginv(X) %>% round(2)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
## [1,] 0.25 0.25 0.25 0.25 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## [2,] -0.25 -0.25 -0.25 -0.25 0.25 0.25 0.25 0.25 0.00 0.00 0.00
## [3,] -0.25 -0.25 -0.25 -0.25 0.00 0.00 0.00 0.00 0.33 0.33 0.33
ginv(X) %*% outcomes$outcome #Confirm estimates are the same
## [,1]
## [1,] 4.070
## [2,] 5.125
## [3,] -1.660
An F test comparing to a null hypothesis model with just an intercept term tells us if there are any detectable differences between groups.
outfit_null <- lm(outcome ~ 1, data=outcomes)
anova(outfit_null, outfit)
## Analysis of Variance Table
##
## Model 1: outcome ~ 1
## Model 2: outcome ~ group
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 10 125.210
## 2 8 33.778 2 91.432 10.828 0.005296 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
What about between specific pairs of groups? summary( )
(and confint( )
) give us two of the answers, “a” vs “control” and “b” vs “control”.
summary(outfit)
##
## Call:
## lm(formula = outcome ~ group, data = outcomes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.065 -1.360 0.110 1.393 2.555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.070 1.027 3.961 0.00417 **
## groupa 5.125 1.453 3.527 0.00776 **
## groupb -1.660 1.569 -1.058 0.32106
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.055 on 8 degrees of freedom
## Multiple R-squared: 0.7302, Adjusted R-squared: 0.6628
## F-statistic: 10.83 on 2 and 8 DF, p-value: 0.005296
confint(outfit)
## 2.5 % 97.5 %
## (Intercept) 1.700809 6.439191
## groupa 1.774458 8.475542
## groupb -5.278999 1.958999
What about “b” vs “a”? For this we need a linear hypothesis test.
coef(outfit)[3] - coef(outfit)[2]
## groupb
## -6.785
library(multcomp)
K <- rbind(c(0, -1, 1))
K %*% coef(outfit)
## [,1]
## [1,] -6.785
result <- glht(outfit, K)
summary(result)
##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: lm(formula = outcome ~ group, data = outcomes)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## 1 == 0 -6.785 1.569 -4.323 0.00253 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
confint(result)
##
## Simultaneous Confidence Intervals
##
## Fit: lm(formula = outcome ~ group, data = outcomes)
##
## Quantile = 2.306
## 95% family-wise confidence level
##
##
## Linear Hypotheses:
## Estimate lwr upr
## 1 == 0 -6.7850 -10.4040 -3.1660
multcomp
can be given multiple rows to test in K
. In this case, by default, it does a multiple testing correction to maintain the Family-Wise Error Rate using a generalization of Tukey’s Honestly Significant Differences.
There’s a further rule to learn about how R deals with factors in linear models. If the intercept is omitted, R produces a predictor for each level in the factor (rather than all but the first).
outfit2 <- lm(outcome ~ 0+group, data=outcomes)
outfit2
##
## Call:
## lm(formula = outcome ~ 0 + group, data = outcomes)
##
## Coefficients:
## groupcontrol groupa groupb
## 4.070 9.195 2.410
model.matrix(outfit2)
## groupcontrol groupa groupb
## 1 1 0 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 0 1 0
## 6 0 1 0
## 7 0 1 0
## 8 0 1 0
## 9 0 0 1
## 10 0 0 1
## 11 0 0 1
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
This fit is equivalent to the original outfit
, but the coefficients have different meanings.
What are the meanings of the three coefficients in this alternative model?
What would be the linear hypotheses to test differences between each pair of groups?
Tooth growth in mouse embryos is studied using RNA-Seq. The RNA expression levels of several genes are examined in the cells that form the upper and lower first molars, in eight individual mouse embryos that have been dissected after different times of embryo development. The measurements are in terms of “Reads Per Million”, essentially the fraction of RNA in each sample belonging to each gene, times 1 million.
(This data was extracted from ARCHS4 (https://amp.pharm.mssm.edu/archs4/). In the Gene Expression Omnibus it is entry GSE76316. The sample descriptions in GEO seem to be out of order, but reading the associated paper and the genes they talk about I think I have the correct order of samples!)
teeth <- read_csv("data/linearModels/teeth.csv")
## Parsed with column specification:
## cols(
## sample = col_character(),
## mouse = col_character(),
## day = col_double(),
## tooth = col_character(),
## gene_ace = col_double(),
## gene_smoc1 = col_double(),
## gene_pou3f3 = col_double(),
## gene_wnt2 = col_double()
## )
teeth$tooth <- factor(teeth$tooth, c("lower","upper"))
teeth$mouse <- factor(teeth$mouse)
It will be convenient to have a quick way to examine different genes and different models with this data.
# A convenience to examine different model fits
more_data <- expand.grid(
day=seq(14.3,18.2,by=0.01),
tooth=as_factor(c("lower","upper")))
look <- function(y, fit=NULL) {
p <- ggplot(teeth,aes(x=day,group=tooth))
if (!is.null(fit)) {
more_ci <- cbind(
more_data,
predict(fit, more_data, interval="confidence"))
p <- p +
geom_ribbon(data=more_ci, aes(ymin=lwr,ymax=upr),alpha=0.1) +
geom_line(data=more_ci,aes(y=fit,color=tooth))
}
p + geom_point(aes(y=y,color=tooth)) +
labs(y=deparse(substitute(y)))
}
# Try it out
look(teeth$gene_ace)
We could treat day
as a categorical variable, as in the previous section. However let us treat it as numerical, and see where that leads.
acefit <- lm(gene_ace ~ tooth + day, data=teeth)
look(teeth$gene_ace, acefit)
Two problems:
In this case, log transformation of the data will solve both these problems.
log2_acefit <- lm( log2(gene_ace) ~ tooth+day, data=teeth)
look(log2(teeth$gene_ace), log2_acefit)
Various transformations of y are possible. Log transformation is commonly used in the context of gene expression. Square root transformation can also be appropriate with nicely behaved count data (technically, if the errors follow a Poisson distribution). This gene expression data is ultimately count based, but is overdispersed compared to the Poisson distribution so square root transformation isn’t appropriate in this case.
In the case of the Pou3f3 gene, the log transformation is even more important. It looks like gene expression changes at different rates in the upper and lower molars, that is there is a significant interaction between tooth
and day
.
pou3f3fit0 <- lm(gene_pou3f3 ~ tooth+day, data=teeth)
look(teeth$gene_pou3f3, pou3f3fit0)
pou3f3fit1 <- lm(gene_pou3f3 ~ tooth*day, data=teeth)
look(teeth$gene_pou3f3, pou3f3fit1)
anova(pou3f3fit0, pou3f3fit1)
## Analysis of Variance Table
##
## Model 1: gene_pou3f3 ~ tooth + day
## Model 2: gene_pou3f3 ~ tooth * day
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 13 4849.7
## 2 12 415.2 1 4434.5 128.15 9.234e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Examining the residuals reveals a further problem: larger expression values are associated with larger residuals.
look(residuals(pou3f3fit1))
plot(predict(pou3f3fit1), residuals(pou3f3fit1))
qqnorm(residuals(pou3f3fit1))
qqline(residuals(pou3f3fit1))
Log transformation both removes the interaction and makes the residuals more uniform (except for one outlier).
log2_pou3f3fit0 <- lm(log2(gene_pou3f3) ~ tooth+day, data=teeth)
log2_pou3f3fit1 <- lm(log2(gene_pou3f3) ~ tooth*day, data=teeth)
anova(log2_pou3f3fit0, log2_pou3f3fit1)
## Analysis of Variance Table
##
## Model 1: log2(gene_pou3f3) ~ tooth + day
## Model 2: log2(gene_pou3f3) ~ tooth * day
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 13 0.56714
## 2 12 0.56579 1 0.0013576 0.0288 0.8681
look(log2(teeth$gene_pou3f3), log2_pou3f3fit0)
qqnorm(residuals(log2_pou3f3fit0))
qqline(residuals(log2_pou3f3fit0))
Look at the expression of gene Wnt2 in column gene_wnt2
.
Try some different model formulas.
Justify a particular model by rejecting simpler alternatives using anova( )
. For example, can we reject that there is no interaction between tooth and day?
In this section we look at fitting the same matrix of predictors X to many different sets of responses y. We will use the package limma
from Bioconductor. This is a very brief demonstration, and there is much more to this package. See the excellent usersguide.pdf at https://bioconductor.org/packages/release/bioc/html/limma.html
Actually in the teeth dataset, the expression level of all genes was measured!
counts_df <- read_csv("data/linearModels/teeth-read-counts.csv")
counts <- as.matrix(counts_df[,-1])
rownames(counts) <- counts_df[[1]]
dim(counts)
## [1] 32544 16
counts[1:5,]
## ind1lower ind1upper ind2lower ind2upper ind3lower ind3upper
## 0610007P14Rik 1721 1846 1708 1877 1443 1534
## 0610009B22Rik 512 466 554 504 501 498
## 0610009L18Rik 68 71 121 129 74 124
## 0610009O20Rik 2583 2797 2840 2975 2675 2690
## 0610010F05Rik 1079 1246 1262 992 1232 992
## ind4lower ind4upper ind5lower ind5upper ind6lower ind6upper
## 0610007P14Rik 1389 1439 1699 1881 1697 1716
## 0610009B22Rik 478 427 569 614 577 568
## 0610009L18Rik 83 87 86 125 99 135
## 0610009O20Rik 2429 2313 2693 3093 2854 3066
## 0610010F05Rik 1011 977 1175 1384 1235 1221
## ind7lower ind7upper ind8lower ind8upper
## 0610007P14Rik 1531 1625 1716 1373
## 0610009B22Rik 587 531 611 453
## 0610009L18Rik 130 132 114 92
## 0610009O20Rik 2487 2904 2713 2455
## 0610010F05Rik 1042 998 1079 882
The column names match our teeth
data frame.
teeth$sample
## [1] "ind1lower" "ind1upper" "ind2lower" "ind2upper" "ind3lower"
## [6] "ind3upper" "ind4lower" "ind4upper" "ind5lower" "ind5upper"
## [11] "ind6lower" "ind6upper" "ind7lower" "ind7upper" "ind8lower"
## [16] "ind8upper"
A usual first step in RNA-Seq analysis is to convert read counts to Reads Per Million, and log2 transform the results. There are some subtleties here which we breeze over lightly: “TMM” normalization is used as a small adjustment to the total number of reads in each sample. A small constant is added to the counts to avoid calculating log2(0)
. The edgeR
and limma
manuals describe these steps in more detail.
library(edgeR)
library(limma)
dgelist <- calcNormFactors(DGEList(counts))
dgelist$samples
## group lib.size norm.factors
## ind1lower 1 37902941 0.9941578
## ind1upper 1 39786308 1.0028804
## ind2lower 1 43133654 0.9984159
## ind2upper 1 40974683 0.9950708
## ind3lower 1 41135198 1.0000633
## ind3upper 1 39615728 1.0011691
## ind4lower 1 37762217 0.9974437
## ind4upper 1 36832933 1.0048675
## ind5lower 1 41669201 1.0037162
## ind5upper 1 49717562 0.9967401
## ind6lower 1 44203546 1.0008790
## ind6upper 1 45924751 1.0041814
## ind7lower 1 39920915 1.0071850
## ind7upper 1 42882436 1.0131047
## ind8lower 1 44664204 0.9855638
## ind8upper 1 40780553 0.9948624
log2_cpms <- cpm(dgelist, log=TRUE, prior.count=0.25)
There is little chance of detecting differential expression in genes with very low read counts. Including these genes will require a larger False Discovery Rate correction, and also confuses limma’s Empirical Bayes parameter estimation. Let’s only retain genes with around 5 reads per sample or more.
keep <- rowMeans(log2_cpms) >= -3
log2_cpms_filtered <- log2_cpms[keep,]
nrow(log2_cpms)
## [1] 32544
nrow(log2_cpms_filtered)
## [1] 17184
We use limma to fit a linear model to each gene. The same model formula will be used in each case. limma doesn’t automatically convert a formula into a model matrix, so we have to do this step manually. Here I am using a model formula that treats the upper and lower teeth as following a different linear trend over time.
X <- model.matrix(~ tooth*day, data=teeth)
X
## (Intercept) toothupper day toothupper:day
## 1 1 0 14.5 0.0
## 2 1 1 14.5 14.5
## 3 1 0 15.0 0.0
## 4 1 1 15.0 15.0
## 5 1 0 15.5 0.0
## 6 1 1 15.5 15.5
## 7 1 0 16.0 0.0
## 8 1 1 16.0 16.0
## 9 1 0 16.5 0.0
## 10 1 1 16.5 16.5
## 11 1 0 17.0 0.0
## 12 1 1 17.0 17.0
## 13 1 0 17.5 0.0
## 14 1 1 17.5 17.5
## 15 1 0 18.0 0.0
## 16 1 1 18.0 18.0
## attr(,"assign")
## [1] 0 1 2 3
## attr(,"contrasts")
## attr(,"contrasts")$tooth
## [1] "contr.treatment"
fit <- lmFit(log2_cpms_filtered, X)
class(fit)
## [1] "MArrayLM"
## attr(,"package")
## [1] "limma"
fit$coefficients[1:5,]
## (Intercept) toothupper day toothupper:day
## 0610007P14Rik 5.8166503 1.3602289 -0.03253986 -0.08309585
## 0610009B22Rik 3.1429659 0.6444600 0.03628325 -0.04855614
## 0610009L18Rik -0.9210806 1.5433085 0.13073928 -0.08390279
## 0610009O20Rik 6.6202820 0.2793953 -0.03744979 -0.01505784
## 0610010F05Rik 5.8290439 0.2983386 -0.06415423 -0.02496738
Significance testing in limma is by the use of linear hypotheses (which limma refers to as “contrasts”). A difference between glht
and limma’s contrasts.fit
is that limma expects the contrasts in columns rather than rows.
We will first look for genes where the slope over time is not flat, averaging the lower and upper teeth.
# Lower slope: c(0,0,1,0)
# Upper slope: c(0,0,1,1)
K <- rbind(c(0,0,1,0.5))
cfit <- contrasts.fit(fit, t(K)) #linear hypotheses in columns!
efit <- eBayes(cfit, trend=TRUE)
The call to eBayes
does Emprical Bayes squeezing of the residual variance for each gene (see final section). This is a bit of magic that allows limma to work well with small numbers of samples.
topTable(efit)
## logFC AveExpr t P.Value adj.P.Val B
## Rnf144b 0.6691463 4.482150 33.59773 8.443061e-16 1.450856e-11 25.92108
## Six2 -0.5596996 6.978090 -29.65927 5.575830e-15 3.897189e-11 24.30201
## Tfap2b -0.6436429 7.468218 -29.10993 7.395514e-15 3.897189e-11 24.05339
## Prom2 0.9160517 3.673827 28.71874 9.071668e-15 3.897189e-11 23.87259
## Ace 0.9868018 2.807906 27.99265 1.335035e-14 4.588250e-11 23.52847
## Vldlr 0.4128294 6.813937 27.62111 1.633011e-14 4.676943e-11 23.34794
## Stk32a 1.2264479 3.705238 27.22394 2.031340e-14 4.986650e-11 23.15153
## Bambi 0.5594820 6.103750 26.42243 3.185674e-14 6.842827e-11 22.74397
## Msx2 0.8109292 6.252888 25.19599 6.508960e-14 1.176579e-10 22.08980
## Igf2bp1 -0.5403002 5.915267 -25.11120 6.846944e-14 1.176579e-10 22.04313
The column adj.P.Val
contains FDR adjusted p-values.
all_results <- topTable(efit, n=Inf)
significant <- all_results$adj.P.Val <= 0.05
table(significant)
## significant
## FALSE TRUE
## 10471 6713
ggplot(all_results, aes(x=AveExpr, y=logFC)) +
geom_point(size=0.1, color="grey") +
geom_point(data=all_results[significant,], size=0.1)
Let’s look at a specific gene.
rnf144b <- log2_cpms["Rnf144b",]
rnf144b_fit <- lm(rnf144b ~ tooth * day, data=teeth)
look(rnf144b, rnf144b_fit)
We can use the same linear hypothesis with glht
. The estimate is the same, but limma has gained some power by shrinking the variance toward the trend line, so limma’s p-value is smaller.
summary( glht(rnf144b_fit, K) )
##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: lm(formula = rnf144b ~ tooth * day, data = teeth)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## 1 == 0 0.6692 0.0187 35.78 1.45e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
Confidence Intervals should also be of interest. However note that these are not adjusted for multiple testing (see final section).
topTable(efit, confint=0.95)
## logFC CI.L CI.R AveExpr t P.Value
## Rnf144b 0.6691463 0.6267819 0.7115106 4.482150 33.59773 8.443061e-16
## Six2 -0.5596996 -0.5998402 -0.5195591 6.978090 -29.65927 5.575830e-15
## Tfap2b -0.6436429 -0.6906748 -0.5966109 7.468218 -29.10993 7.395514e-15
## Prom2 0.9160517 0.8482026 0.9839007 3.673827 28.71874 9.071668e-15
## Ace 0.9868018 0.9118167 1.0617869 2.807906 27.99265 1.335035e-14
## Vldlr 0.4128294 0.3810374 0.4446215 6.813937 27.62111 1.633011e-14
## Stk32a 1.2264479 1.1306211 1.3222747 3.705238 27.22394 2.031340e-14
## Bambi 0.5594820 0.5144416 0.6045225 6.103750 26.42243 3.185674e-14
## Msx2 0.8109292 0.7424686 0.8793898 6.252888 25.19599 6.508960e-14
## Igf2bp1 -0.5403002 -0.5860676 -0.4945327 5.915267 -25.11120 6.846944e-14
## adj.P.Val B
## Rnf144b 1.450856e-11 25.92108
## Six2 3.897189e-11 24.30201
## Tfap2b 3.897189e-11 24.05339
## Prom2 3.897189e-11 23.87259
## Ace 4.588250e-11 23.52847
## Vldlr 4.676943e-11 23.34794
## Stk32a 4.986650e-11 23.15153
## Bambi 6.842827e-11 22.74397
## Msx2 1.176579e-10 22.08980
## Igf2bp1 1.176579e-10 22.04313
limma can also test several simultaneous constraints on linear combinations of coefficients. Suppose we want to find any deviation from a constant expression level. We can check for this with:
K2 <- rbind(
c(0,1,0,0),
c(0,0,1,0),
c(0,0,0,1))
cfit2 <- contrasts.fit(fit, t(K2))
efit2 <- eBayes(cfit2, trend=TRUE)
topTable(efit2)
## Coef1 Coef2 Coef3 AveExpr F P.Value
## Pou3f3 4.0047206 -0.3380707 -0.01608095 5.129071 504.0972 1.484696e-15
## Six2 -0.7942430 -0.6091343 0.09886927 6.978090 413.1779 6.717734e-15
## Rnf144b -1.4417453 0.6361262 0.06604010 4.482150 398.9320 8.764172e-15
## Tfap2b -4.3123192 -0.7918207 0.29635558 7.468218 330.3518 3.652672e-14
## Prom2 0.6123007 0.9616841 -0.09126493 3.673827 322.9195 4.337796e-14
## Vldlr 2.2398523 0.4745006 -0.12334239 6.813937 275.7521 1.427338e-13
## Ace -0.4928863 0.9802659 0.01307194 2.807906 265.2266 1.913299e-13
## Stk32a 0.8858069 1.2654438 -0.07799180 3.705238 251.8520 2.823707e-13
## Bambi 1.5850479 0.6165574 -0.11415072 6.103750 245.4526 3.426369e-13
## Ptprt 3.5394970 0.8807989 -0.18751019 3.087069 216.1003 8.909622e-13
## adj.P.Val
## Pou3f3 2.551301e-11
## Six2 5.020118e-11
## Rnf144b 5.020118e-11
## Tfap2b 1.490814e-10
## Prom2 1.490814e-10
## Vldlr 4.087896e-10
## Ace 4.696876e-10
## Stk32a 6.065323e-10
## Bambi 6.542081e-10
## Ptprt 1.388992e-09
A shortcut would be to use contrasts.fit(fit, coefficients=2:4)
here instead, or to specify a set of coefficients directly to topTable( )
. The results would be the same.
Construct and use linear combinations of coefficients to find genes that:
Differ in slope between lower and upper molars.
Differ in expression on day 16 between the lower and upper molars.
Hint: the null hypothesis for 2 can be specified as saying the difference in predictions between two particular samples is zero.
Not all of the genes in our teeth data have a straight line relationship with time. Returning to the original data frame, let’s look at the Smoc1 gene.
log2_smoc1fit <- lm(log2(gene_smoc1) ~ tooth + day, data=teeth)
look(log2(teeth$gene_smoc1), log2_smoc1fit)
In this case, log transformation does not remove the curve. If you think this is a problem for linear models, you are mistaken! With a little feature engineering we can fit a quadratic curve. Calculations can be included in the formula if wrapped in I( )
:
curved_fit <- lm(log2(gene_smoc1) ~ tooth + day + I(day^2), data=teeth)
look(log2(teeth$gene_smoc1), curved_fit)
Another way to do this would be to add the column to the data frame:
teeth$day_squared <- teeth$day^2
curved_fit2 <- lm(log2(gene_smoc1) ~ tooth + day + day_squared, data=teeth)
Finally, the poly( )
function can be used in a formula to fit polynomials of arbitrary degree. poly
will encode day
slightly differently, but produces an equivalent fit.
curved_fit3 <- lm(log2(gene_smoc1) ~ tooth + poly(day,2), data=teeth)
sigma(curved_fit)
## [1] 0.1630425
sigma(curved_fit2)
## [1] 0.1630425
sigma(curved_fit3)
## [1] 0.1630425
poly( )
can also be used to fit higher order polynomials, but these tend to become wobbly and extrapolate poorly. A better option may be to use the ns( )
or bs( )
functions in the splines
package, which can be used to fit piecewise “B-splines”. In particular ns( )
(natural spline) is appealing because it extrapolates beyond the ends only with straight lines. If the data is cyclic (for example cell cycle or circadian time series), sine and cosine terms can be used to fit some number of harmonics from a Fourier series.
library(splines)
spline_fit <- lm(log2(gene_smoc1) ~ tooth * ns(day,3), data=teeth)
look(log2(teeth$gene_smoc1), spline_fit)
All of these approaches can also be used with limma (or edgeR or DESeq2).
In limma, Empirical Bayes squeezing of the residual variance acts as though we have some number of extra “prior” observations of the variance. These are also counted as extra degrees of freedom in F tests. The “prior” observations act to squeeze the estimated residual variance of each gene toward a trend line that is a function of the average expression level. (There is also an alternative approach that estimates variance at the level of individual observations rather than genes, called voom
.)
efit <- eBayes(cfit, trend=TRUE)
efit$df.prior
## [1] 3.360083
efit$df.residual
## [1] 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12
## [ reached getOption("max.print") -- omitted 17163 entries ]
efit$df.total
## [1] 15.36008 15.36008 15.36008 15.36008 15.36008 15.36008 15.36008
## [8] 15.36008 15.36008 15.36008 15.36008 15.36008 15.36008 15.36008
## [15] 15.36008 15.36008 15.36008 15.36008 15.36008 15.36008 15.36008
## [ reached getOption("max.print") -- omitted 17163 entries ]
plotSA(efit)
points(efit$Amean, efit$s2.post^0.25, col="red", cex=0.2)
The total effective degrees of freedom is the “prior” degrees of freedom plus the normal residual degrees of freedom. As can be seen in the plot, compared to the residual variance (black dots), this produces a posterior residual variance (efit$s2.post
, red dots) that is squeezed toward the trend line.
It’s worthwhile checking df.prior
when using limma, as a low value may indicate a problem with a data-set.
We noted the CIs produced by limma were not adjusted for multiple testing. A False Coverage-statement Rate (FCR) corrected CI can be constructed corresponding to a set of genes judged significant. The smaller the selection of genes as a proportion of the whole, the greater the correction required. To ensure a False Coverage-statement Rate of q, we need (1-q*n_genes_selected/n_genes_total)*100%
confidence intervals.
all_results <- topTable(efit, n=Inf)
significant <- all_results$adj.P.Val <= 0.05
prop_significant <- mean(significant)
fcr_confint <- 1 - 0.05*prop_significant
all_results <- topTable(efit, confint=fcr_confint, n=Inf)
ggplot(all_results, aes(x=AveExpr, y=logFC)) +
geom_point(size=0.1, color="grey") +
geom_errorbar(data=all_results[significant,], aes(ymin=CI.L, ymax=CI.R), color="red") +
geom_point(data=all_results[significant,], size=0.1)
The FCR corrected CIs used here have the same q, 0.05, as we used as the cutoff for adj.P.Val. This means they never pass through zero.
I have some further thoughts on this topic, see the package topconfects
(http://logarithmic.net/topconfects/).