A site to help Biochemists learn R.

Starting points

Wednesday, 24 June 2015

Microarray files and script...

Please try to download the data from here:
http://science2therapy.com/data/RAWdata.txt.zip

and here:
http://science2therapy.com/data/REDUCEDdata.txt.zip

Here is the script:

# Readme: note 1 - packages
# please refer to the accompanying Word document

#source("http://bioconductor.org/biocLite.R")
#biocLite("oligo")
#biocLite("genefilter")

# Now to "switch on" the oligo and genefilter packages, using the library command
library(oligo)
library(genefilter)

# Readme: note 2 - parameters
par(ask=T)

# Readme: note 3 - working directories

setwd('/Users/Stone/Desktop/RforBiol/GSE56899_Reduced/')
setwd('/Users/Stone/Desktop/RforBiol/GSE56899_RAW')

# dir() lists all the files in the working directory, this is the input for the 
# read.celfiles() command
celfiles <- read.celfiles(dir())

# Readme: note 4 - classes and objects
class(celfiles)

# How many rows of data are these raw datafiles?
nrow(exprs(celfiles))

# We can boxplot these data, but the command will take a long time, so I have deactivated it
# boxplot(dat)

# Readme: note 5 - probes, probesets and genes
dat <- rma(celfiles)
dat <- exprs(dat)
# The data is now normalized and summarized
# Technical replicates on the array are now averaged to a single probeset value

# Now to inspect the data
boxplot(dat)

# I can cluster the data by array, and plot it
# I first need to find the "distances" between the arrays
# and show those distances in the hierarchical cluster plot
# I can pile up all the commands in one
plot(hclust(dist(t(dat))))

# Write a pdf version as a more permanent version
setwd('/Users/Stone/Desktop/RforBiol/')
pdf("dData Clusterplot.pdf")
plot(hclust(dist(t(dat))))
dev.off()

# Finally I have Principal Component Analysis which is a very powerful 
# and widely-used technique for viewing patterns of gross variation
# in datasets
# We can see whether an array is substantially different to the others
plot(princomp(dat)$loadings)
outliers <- identify(princomp(dat)$loadings)

if (length(outliers) > 0) {
  cat("The outliers are: ", colnames(dat)[outliers], sep="\n")
} else {
  cat ("No outliers were selected")
}  
  
# Readme: note 6 - regular expressions
# Phenotypic information is contained 

nonpreg <- grep("Non_Preg", colnames(dat))
preg <- (1:ncol(dat))[-nonpreg]

preglabel <- rep("P", ncol(dat))
preglabel[nonpreg] <- "N"
plot(hclust(dist(t(dat))), labels=preglabel)

preg.number <- gsub(".*P(\\d+)_.*", "\\1", colnames(dat[,preg]))
nonpreg.number <- gsub(".*Non_Preg_(\\d+).*", "\\1", colnames(dat[,nonpreg]))

trimester <- gsub(".*_T(\\d+).*", "\\1", colnames(dat[,preg]))

# Readme: note 7 - coercion
trimester <- as.numeric(trimester)

# PCA plot of only the pregnant samples, by trimester
plot(princomp(dat[,preg])$loadings, col=trimester, pch=20, cex=2)

# Readme: note 8 - factors
pstatus <- rep("FALSE", times=ncol(dat))
pstatus[preg] <- "TRUE"
pstatus <- factor(pstatus)

plot(princomp(dat)$loadings, pch=20, cex=2, col=pstatus)

###############################
#        4 Ways to do exactly the same thing
#        l33297 row-by-row t.tests
#        in increasing orders of sophistication
###############################

# Method 1 - NO!
t.test(dat[1,preg], dat[1, nonpreg])$p.value
t.test(dat[2,preg], dat[2, nonpreg])$p.value
# etc etc
# this is madness

# Method 2 use a loop
pvals <- rep(0, nrow(dat))
for (i in 1:nrow(dat)) {
  pvals[i] <- t.test(dat[i, preg], dat[i, nonpreg])$p.value
}

# Method 3 the apply function

  # First to introduce the apply function
  # Here is apply using an off-the-shelf function, mean()
  # We calculate the means of everyrow

  apply(dat, 1, mean)

  # Every column mean
  apply(dat, 2, mean)
  plot(apply(dat, 2, mean))

# Now the p-value calculation using apply
# Although the t.test() function is off-the-shelf
# We want to extract the p-value, so we need to write our own function

pvals <- apply(dat, 1, function(x) { t.test(dat[x,preg], dat[x,nonpreg])$p.value }  ) 

# Method 4 the package way
t.preg <- rowttests(dat, pstatus)

# Let's take a look at the top of this
head(t.preg)

# Readme: note 9 - Multiple-testing adjustment
adj.p <- p.adjust(t.preg$p.value, method="BH")
t.preg <- cbind(t.preg, adj.p)

# We can have a look at our p-values as a histogram
hist(t.preg$adj.p)

# Or get a summary, summary() is a very useful "first look" command
summary(t.preg$adj.p)

# How many genes have an adjusted p-value of less that 5%?
sum(t.preg$adj.p < 0.05)

# What rows (probesets) are significant?
sig.preg <- which(t.preg$adj.p < 0.05)
sig.preg.dat <- dat[sig.preg, ]

# How about a lovely heatmap of the data
heatmap(sig.preg.dat)

# What are the probeset id's of the significant genes?
sig.preg.probes <- rownames(dat)[sig.preg]

# Let's write these to a text file
write.table(sig.preg.probes, '/Users/Stone/Desktop/RforBiol/SigPregProbest', 
            quote=F, row.names=F, col.names=F)

preg.dat <- dat[,preg]

# I perform a 3-group analysis (F-test) on the trimester data
# I'm using some shortcuts, I'm not bothering to make a separate factor for trimester
t.trim <- rowFtests(preg.dat, as.factor(trimester))

# Another short-cut, I'm doing the multiple correction and sending it straight to the 
# summary() command, so that I can immediately see whether there is anything signifacnt 
# or not
summary(p.adjust(t.trim$p.value, method="BH"))
# The lowest adjusted p-value is 0.9304

#######  Complicated stuff

# Using the apply function to remove a co-variate

# Let's make a pretend variable
smoking <- sample(x=c(T, F), ncol(dat), replace=T, prob = c(0.25, 0.75))
drinking <- sample(x=c(T, F), ncol(dat), replace=T, prob = c(0.6, 0.4))
obese <- sample(x=c(T, F), ncol(dat), replace=T, prob = c(0.3, 0.7))


#no.smoke <- apply(dat, 1, function(x) { lm(x ~ smoking )$resid})
#no.drink <- apply(dat, 1, function(x) { lm(x ~ drinking )$resid})
#no.anything <- apply(dat, 1, function(x) { lm(x ~ smoking + drinking + obese)$resid})




#############
#
# This is the "fancy" way for serious microarray people
# Using the limma package
# I am not going to describe it in any detail
# But you fit a linear model to each gene
#

design <- model.matrix(~ 0 + pstatus )
colnames(design) <- c("preg", "nonpreg")
fit <- lmFit(dat, design)
fit <- eBayes(fit)

contrasts <- makeContrasts(preg - nonpreg, levels=design)
fit2 <- contrasts.fit(fit, contrasts)
fit2 <- eBayes(fit2)

preg.genes <- topTable(fit2, n=Inf)
sum(preg.genes$adj.P.Val < 0.05)


No comments:

Post a Comment

Comments and suggestions are welcome.