-
Notifications
You must be signed in to change notification settings - Fork 0
/
workflow.R
199 lines (163 loc) · 8.46 KB
/
workflow.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# At the top of each section, it was stated that which tutorial the codes were
# mainly from, specific modification would be comment on that line.
# invisible() funtion was used to suppress the system output, make the output more tidy
#===============================================================================
#import required libraries======================================================
#===============================================================================
library(limma)
library(affy)
library(gcrma)
library(annotate)
library(mouse4302.db)# load chip-specific annotation
library(scatterplot3d)
#BiocManager::install("arrayQualityMetrics")
library(arrayQualityMetrics)
options(warn=-1)# suppress warning
#===============================================================================
#QC of Raw Data=================================================================
#===============================================================================
##load the files for the workflow, this part of code is mainly from tutorial 2
# load the targets file and get affyBatch from it
targets <- read.table("targets.txt",header=T,as.is=T)
rawData <- ReadAffy(filenames=targets$Filename, sampleNames=targets$Name)
# create a directory for QC results
invisible(dir.create("QC_Results"))
# boxplot of raw data
png(filename = "./QC_Results/boxplot_raw.png")
par(mar=c(3,7,2,1))
# add xlabel, title, and change to horizontal
boxplot(rawData, cex.axis = 1, las = 1,
xlab = "Expression",
col = targets$Colour,
main = "Distribution of raw intensity values",
horizontal = T)
invisible(dev.off())
print("boxplot of raw data created -> ./QC_Results/boxplot_raw.png")
print("Running comprehensive QC analysis for the raw data!")
# modified from https://www.bioconductor.org/packages/release/bioc/vignettes/arrayQualityMetrics/inst/doc/arrayQualityMetrics.R
# comprehensive QC of the raw data
arrayQualityMetrics(expressionset = rawData,
outdir = "./QC_Results/Report_for_raw",
force = TRUE,
do.logtransform = TRUE)
print("Completed")
print("Result can be found in './QC_Results/Report_for_raw/index.html'")
#===============================================================================
#QC of Normalized Data==========================================================
#===============================================================================
# using gcRMA to normalize the raw data
print("Starting normalization by gcRMA")
# set fast to False, referred to https://www.bioconductor.org/packages/devel/bioc/manuals/gcrma/man/gcrma.pdf
eset <- gcrma(rawData, fast=F)
# boxplot of normalized data
png("./QC_Results/boxplot_normalized.png")
par(mar=c(3,7,2,1))
boxplot(eset, cex.axis = 1, las = 1,
xlab = "Expression",
col = targets$Colour,
main = "Distribution of normalized intensity values",
horizontal = T)
invisible(dev.off())
print("boxplot of raw data created -> ./QC_Results/boxplot_normalized.png")
# comprehensive QC for the normalized data
print("Running comprehensive QC analysis for the normalized data!")
arrayQualityMetrics(expressionset = eset,
outdir = "./QC_Results/Report_for_normalized",
force = TRUE)
print("Completed")
print("Result can be found in './QC_Results/Report_for_normalized/index.html'")
# get expression value matrix
exprsvals <- exprs(eset)
# 3d-PCA plot of the normalized data, from tutorial 3
pca <- prcomp(t(exprsvals), scale = T)
png("./QC_Results/3d-pca.png")
s3d <- scatterplot3d(pca$x[,1:3], pch = 19, color = targets$Colour,
main = "3D-PCA of Normalized Dataset")
s3d.coords <- s3d$xyz.convert(pca$x[,1:3])
text(s3d.coords$x, s3d.coords$y, labels = colnames(exprsvals),pos = 3,offset = 0.5)
invisible(dev.off())
print("The 3D-PCA plot of normalized data created -> ./QC_Results/3d-pca.png")
# mva plot, codes from tutorial 3
png("./QC_Results/MVA_plot.png")
mva.pairs(exprsvals, cex = 1, main = "MVA plot of normalized data")
invisible(dev.off())
#===============================================================================
#Fold Change Analysis===========================================================
#===============================================================================
## perform fold filtering, this part of code is mainly modified from tutorial 3
# gcRMA outputs log2 data while MAS5 outputs linear data
# To convert from log¦
exprsvals10 <- 2^exprsvals
# get the mean of infected samples
c2c12.inf <- apply(exprsvals10[,c("C2C12_Inf.1",
"C2C12_Inf.2",
"C2C12_Inf.3")],1,mean)
# get the mean of control samples
c2c12.ctrl <- apply(exprsvals10[,c("C2C12_Ctrl.1",
"C2C12_Ctrl.2",
"C2C12_Ctrl.3")],1,mean)
# calculate the fold
c2c12.inf_to_ctrl <- c2c12.inf / c2c12.ctrl
# combine all the data including log2-transformation
all.data = cbind(c2c12.ctrl, c2c12.inf, c2c12.inf_to_ctrl, abs(1 - c2c12.inf_to_ctrl))
colnames(all.data)[4] <- "abs(1-FC)"
# write the combined data to file
write.table(all.data, file="Fold_change.txt", quote=F, sep = '\t', col.names = NA)
print("Fold Change Result created -> ./Fold_change.txt")
#===============================================================================
#Statistical Analysis===========================================================
#===============================================================================
## Beginning statistical analysis, code mainly modified from tutorial 4
#build an annotation table
ID <- featureNames(eset)
Symbol <- getSYMBOL(ID, "mouse4302.db")
Name <- as.character(lookUp(ID, "mouse4302.db", "GENENAME"))
tmp <- data.frame(ID=ID, Symbol=Symbol, Name=Name, stringsAsFactors=F)
tmp[tmp=="NA"] <- NA #fix padding with NA characters
#assign as feature data of the current Eset
fData(eset) <- tmp
## Statistical analysis using Limma, code mainly modified from tutorial 4
# create design matrix
design <- model.matrix(~-1+factor(c(1,1,1,2,2,2)))
colnames(design) <- c("C2C12_Inf", "C2C12_Ctrl")
# create constratmatrix, where Limma comparisons to make
contrastmatrix <- makeContrasts(C2C12_Inf-C2C12_Ctrl,
levels=design)
# issue these commands to fit the model
# and make the contrasts
fit <- lmFit(eset, design)
fit2 <- contrasts.fit(fit, contrastmatrix)
# this last part essentially moderates the t-statistic using
# the borrowed variance approach described in class
fit2 <- eBayes(fit2)
# get the results
myresults <-topTable(fit2,coef=1, adjust="fdr", number=nrow(eset))
write.table(myresults,"differential.txt")
print("Statistical Differential Analysis Result created -> ./differential.txt")
#===============================================================================
#Enrichment Analysis============================================================
#===============================================================================
## Carry out Functional Enrichment analysis, code mainly modified from tutorial 5
# read annotation database
Mm.H <- readRDS("Mm.h.all.v7.1.entrez.rds")
## Process annotation for functional enrichment
# Here we select from the annotation a number of keys with the primary key being PROBEID
# due to the use of arrayQualityMetrics, use AnnotationDbi::select instead
# ref: https://support.bioconductor.org/p/101902/#101903
res <- AnnotationDbi::select(mouse4302.db, keys = rownames(eset),
columns = c("ENTREZID", "ENSEMBL","SYMBOL"),
keytype="PROBEID")
#find the index of each row of the expression set in the #annotation object res
idx <- match(rownames(eset), res$PROBEID)
#Use the index to set the phenotypic data in the ExpressionSet
fData(eset) <- res[idx, ]
#Find all rows that don't have an EntrezID and remove then
eset_t<-eset[is.na(fData(eset)$ENTREZID)==0,]
## Functional Enrichment Analysis, codes referred to Tutorial 5
#convert to indexes
H.indices <- ids2indices(Mm.H,fData(eset_t)$ENTREZID)
# run mroast for enrichmen analysis
results <-camera(eset_t,index=H.indices,design=design,contrast=contrastmatrix[,1],adjust.method = "BH")
# write the enrichment analysis results to file
write.table(results,"enrichment.txt",sep="\t")
print("Functional Enrichment Analysis result created -> ./enrichment.txt")