A site to help Biochemists learn R.

Starting points

Monday, 7 March 2016

Gene Expression Analysis and Visualization for VizBi 2016 (Pt 2)

Updated: I have updated this script after the VisBi Tutorial. I have added some comments which I hope explain things better.

This second part of the gene analysis script generates list of genes that are differentially expressed following drug treatment. The data set is the same as used in first part of the analysis script. It's on github and can be downloaded within R. I like the volcano plots which facilitate a nice understanding of the scale of changes caused by the different drugs.


# this script uses the limma pacakge from Bioconductor
# these first two lines install it. 
# for more information about the package, go here:
# https://bioconductor.org/packages/release/bioc/html/limma.html
# it was created to apply linear modelling to microarray data 
# but it can be used for any sets of numbers where you have multiple groups

x <- getURL("https://raw.githubusercontent.com/brennanpincardiff/RforBiochemists/master/data/microArrayData.tsv")
data <- read.table(text = x, header = TRUE, sep = "\t")

new.data <- data[2:16]
# these are log2 values 
# The data has been normalized and summarized

# turn into a matrix 
data.m <- as.matrix(new.data)

## calculate the significant differences limma!
names <- colnames(data.m)
names <- factor(gsub("\\.\\d", "", names)) # change names into factor

design <- model.matrix(~ 0 + names)
# this looks about right - each treatment has three samples
# no overlap between the samples. 

# Fit linear model for each gene given a series of arrays
fit <- lmFit(object = data.m, design = design)

# set up the experiment using makeContrasts
# makeContrasts
contrast <- makeContrasts(
  Drug_A = namesUT - namesDrug_A,
  Drug_B = namesUT - namesDrug_B,
  Drug_C = namesUT - namesDrug_C,
  Drug_D = namesUT - namesDrug_D,
  levels = design)

fit <- contrasts.fit(fit, contrast)

# Given a microarray linear model fit, compute moderated t-statistics, 
# moderated F-statistic, and log-odds of differential expression by 
# empirical Bayes moderation of the standard errors towards a common value.
fit.bayes <- eBayes(fit)

# using topTable to extract gene lists
# this method adjusts for multiple testing using the Benjamini and Hochberg method
# for information on this see the limma documentation
# https://bioconductor.org/packages/release/bioc/html/limma.html or
# there is lots of information on the internet
# e.g.: https://en.wikipedia.org/wiki/False_discovery_rate
Drug_A.diff <- topTable(fit.bayes, n=5000, adjust.method = "BH", coef=1)
Drug_B.diff <- topTable(fit.bayes, n=5000, adjust.method = "BH", coef=2)
Drug_C.diff <- topTable(fit.bayes, n=5000, adjust.method = "BH", coef=3)
Drug_D.diff <- topTable(fit.bayes, n=5000, adjust.method = "BH", coef=4)

# rownames of these are the row names of data...
str(Drug_D.diff)  # it's a data.frame


# we can draw a volcano plat because we have calculated the log 2 fold change and the p-value
Drug_A.diff$threshold = as.factor(Drug_A.diff$adj.P.Val < 0.01)
Drug_B.diff$threshold = as.factor(Drug_B.diff$adj.P.Val < 0.01)
Drug_C.diff$threshold = as.factor(Drug_C.diff$adj.P.Val < 0.01)
Drug_D.diff$threshold = as.factor(Drug_D.diff$adj.P.Val < 0.01)

##Construct the plot object
gA <- ggplot(data=Drug_A.diff, 
            aes(x=logFC, y =-log10(adj.P.Val), 
                colour=threshold)) +
  geom_point(alpha=0.4, size=1.75) +
  xlab("log2 fold change") + ylab("-log10 p-value") +
  theme_bw() +

gA <- gA + ggtitle("Drug A")
gA     # shows us the object - the graph

# this %+% allows us to push a different dataset into an existing 
# ggplot object. It overwrites the data
# it must contain the same column names otherwise it won't work.
gB <- gA %+% Drug_B.diff

# then we need to give it a new title
gB <- gB + ggtitle("Drug B")

# do the same for the other Drugs
gC <- gA %+% Drug_C.diff
gC <- gC + ggtitle("Drug C")
gD <- gA %+% Drug_D.diff
gD <- gD + ggtitle("Drug D")

# These visualisations work.
# no significant differences with Drug A, more with Drug B and C
# lots of differences with Drug D.

# run code below to make multiplot function first!
multiplot(gA, gB, gC, gD, cols=2)


# merge Drug_D adjusted P value and LogFC with expression values...
Drug_D.diff$probeset_id <- rownames(Drug_D.diff)
merged.data <- merge(data, Drug_D.diff, by="probeset_id")

# subset merged data for significantly altered genes
data.2.heatmap <- subset(merged.data, adj.P.Val < 0.001)

# get the numbers and turn into a matrix
data.2.heatmap.num <- data.2.heatmap[2:16]  # just the numbers
data.2.heatmap.m <- as.matrix(data.2.heatmap.num)

# if you make your plot window big, this will work
         RowSideLabs = FALSE, 
         main = "Heatmap of Drug Effects")

# otherwise print it to a PDF file. 
         RowSideLabs = FALSE, 
         main = "Heatmap of Drug Effects")

# It's not perfect but it's the best I can do at the moment.

# from: http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
# Multiple plot function
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  numPlots = length(plots)
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  if (numPlots==1) {
  } else {
    # Set up the page
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))

No comments:

Post a Comment

Comments and suggestions are welcome.