############################################################
# This code performs the quality control of the Moran and Zhang datasets
#By Ahmed Hemedan
################################################################
# Quality control

# Quality report for the first dataset (Zhang).
# The expression data are wrapped in a minimal ExpressionSet; the report is
# written to "Quality_Report_Zhang" and no log transform is requested.
minimalSet <- ExpressionSet(assayData = as.matrix(zhangfilt))
arrayQualityMetrics(
  expressionset = minimalSet,
  outdir = "Quality_Report_Zhang",
  force = TRUE,
  do.logtransform = FALSE
)


# Quality report for the second dataset (Moran).
# minimalSet is overwritten with the Moran data before the second report.
minimalSet <- ExpressionSet(assayData = as.matrix(moranfilt))
arrayQualityMetrics(
  expressionset = minimalSet,
  outdir = "Quality_Report_Moran",
  force = TRUE,
  do.logtransform = FALSE
)


# Remove all samples failing at least two quality tests.

# Zhang dataset: sample IDs flagged for removal.
failed_samples_zhang <- c("GSM606624", "GSM606625", "GSM606626")
remove_samples_zhang <- match(failed_samples_zhang, colnames(zhangfilt))

# match() yields NA for an ID that is not a column name; a negative subscript
# containing NA fails with an obscure message, so report the missing IDs
# explicitly instead.
if (anyNA(remove_samples_zhang)) {
  stop(
    "Samples not found in zhangfilt: ",
    paste(failed_samples_zhang[is.na(remove_samples_zhang)], collapse = ", "),
    call. = FALSE
  )
}

# The outcome labels are subset by column position below, so they must line
# up one-to-one with the columns of the expression data.
stopifnot(length(zhang_outcomefilt) == ncol(zhangfilt))

# drop = FALSE keeps the two-dimensional structure regardless of how many
# columns remain.
zhangfilt2 <- zhangfilt[, -remove_samples_zhang, drop = FALSE]
zhang_outcome_final <- zhang_outcomefilt[-remove_samples_zhang]

# Moran dataset: no failing samples to remove
moran_outcome_final <- moran_outcomefilt


# Manual outlier check: Zhang dataset

# Outlier detection via hierarchical clustering: samples are the columns of
# zhangfilt2, so the matrix is transposed before computing distances.
distmat <- dist(t(zhangfilt2))
hcldat <- hclust(distmat, method = "average")
plot(hcldat)

# Outlier detection via PCoA: the sample distance matrix computed above is
# reused for the two-dimensional scaling.
medianscale <- cmdscale(distmat, k = 2)

# One colour per outcome class, assigned in order of first appearance.
plot(
  medianscale[, 1],
  medianscale[, 2],
  col = rainbow(2)[match(zhang_outcome_final, unique(zhang_outcome_final))],
  pch = 20,
  main = "PCoA plot",
  labels = NULL,
  cex = 2,
  cex.axis = 0.1,
  tck = 0,
  xlab = "Dimension 1",
  ylab = "Dimension 2"
)

# Manual outlier check: Moran dataset

# Outlier detection via hierarchical clustering on the sample distances
# (samples are the columns of moranfilt, hence the transpose).
distmat <- dist(t(moranfilt))
hcldat <- hclust(distmat, method = "average")
plot(hcldat)

# Outlier detection via PCoA: the sample distance matrix computed above is
# reused for the two-dimensional scaling.
medianscale <- cmdscale(distmat, k = 2)

# One colour per outcome class, assigned in order of first appearance.
plot(
  medianscale[, 1],
  medianscale[, 2],
  col = rainbow(2)[match(moran_outcome_final, unique(moran_outcome_final))],
  pch = 20,
  main = "PCoA plot",
  labels = NULL,
  cex = 2,
  cex.axis = 0.1,
  tck = 0,
  xlab = "Dimension 1",
  ylab = "Dimension 2"
)


# Data transformation using variance stabilising normalization (VSN)

# Zhang dataset: inspect the data for intensity-dependent variance first.
meanSdPlot(as.matrix(zhangfilt2))

# Fit VSN on the filtered Zhang data and keep the transformed expression
# matrix.
zhangvsn <- exprs(
  vsn2(as.matrix(zhangfilt2))
)

# Verify the fit: repeat the mean-sd plot on the transformed data.
meanSdPlot(zhangvsn)


# Moran dataset: check for intensity-dependent variance
meanSdPlot(as.matrix(moranfilt))
# yes, variance dependence on average intensity -- apply VSN transformation

moranvsn <- exprs(vsn2(as.matrix(moranfilt)))
# Show only the dimensions of the result; printing the whole expression
# matrix floods the console without adding information.
dim(moranvsn)
# verify the fit: repeat the mean-sd plot on the transformed data
meanSdPlot(moranvsn)

# Power calculation: Zhang dataset

# Input list for samr. The outcome "y" for two unpaired classes must be the
# numeric labels 1, 2: "disease state: Control" samples are coded 1, all
# other samples 2. Gene ids are the row indices; gene names are "g<index>".
data <- list(
  x = zhangvsn,
  y = ifelse(zhang_outcome_final == "disease state: Control", 1, 2),
  geneid = as.character(seq_len(nrow(zhangvsn))),
  genenames = paste0("g", seq_len(nrow(zhangvsn))),
  logged2 = TRUE
)


# Run the simulation with 1000 permutations.
samr.obj <- samr(
  data,
  resp.type = "Two class unpaired",
  nperms = 1000,
  random.seed = 1234
)


# Sample sizes of interest (10, 20, 30, 50), expressed as factors of the
# current number of samples.
colnum <- ncol(data$x)
sfactors <- c(10, 20, 30, 50) / colnum

# Set the seed value for the random number generator.
set.seed(1234)


# Power to detect 1.5-fold changes.
samr.assess15 <- samr.assess.samplesize(
  samr.obj, data, log2(1.5),
  samplesize.factors = sfactors
)
samr.assess.samplesize.plot(samr.assess15)

# Power to detect 1.1-fold changes.
samr.assess11 <- samr.assess.samplesize(
  samr.obj, data, log2(1.1),
  samplesize.factors = sfactors
)
samr.assess.samplesize.plot(samr.assess11)


# Moran data - power calculation


# Input list for samr. The outcome "y" for two unpaired classes must be the
# numeric labels 1, 2: "control" samples are coded 1, all other samples 2.
# Gene ids and names are derived from the rows of the Moran matrix itself
# (not the Zhang matrix), so their length always matches nrow(x).
data <- list(
  x = moranvsn,
  y = ifelse(moran_outcome_final == "control", 1, 2),
  geneid = as.character(seq_len(nrow(moranvsn))),
  genenames = paste0("g", seq_len(nrow(moranvsn))),
  logged2 = TRUE
)


# run the simulation with 1000 permutations
samr.obj <- samr(
  data,
  resp.type = "Two class unpaired",
  nperms = 1000,
  random.seed = 1234
)


# investigate the following sample sizes of interest: 10, 20, 30, 50
# (expressed as factors of the current number of samples)
colnum <- ncol(data$x)
sfactors <- c(10, 20, 30, 50) / colnum

# set seed value for the random number generator
set.seed(1234)


# determine power to detect 1.5-fold changes
samr.assess15 <- samr.assess.samplesize(
  samr.obj, data, log2(1.5),
  samplesize.factors = sfactors
)
samr.assess.samplesize.plot(samr.assess15)

# determine power to detect 1.1-fold changes
samr.assess11 <- samr.assess.samplesize(
  samr.obj, data, log2(1.1),
  samplesize.factors = sfactors
)
samr.assess.samplesize.plot(samr.assess11)
zhang_outcome_final

# DEG Analysis of individual datasets

# Limma analysis of Zhang dataset
# Recode the outcome: "disease state: Control" -> "control", everything else
# -> "parkinson".
zhang_label <- ifelse(
  zhang_outcome_final == "disease state: Control",
  "control",
  "parkinson"
)

# One design column per group, no intercept.
design <- model.matrix(~ -1 + factor(zhang_label))
design
# The design columns follow the factor level order, not the order in which
# the labels first appear in the data. Naming them with unique() would swap
# the group names (and flip the sign of the contrast) whenever the first
# sample is not a control, so take the names from the factor levels.
colnames(design) <- levels(factor(zhang_label))
colnames(design)

# compute simple linear model fit to microarray data
fit <- lmFit(zhangvsn, design)
contrast.matrix <- makeContrasts(parkinson - control, levels = design)
fit2 <- contrasts.fit(fit, contrast.matrix)
eb <- eBayes(fit2)

# extract the ranking table (all genes) and show the top-ranked genes
ttable_zhang <- topTable(eb, number = nrow(zhangfilt2))
head(ttable_zhang)


# Limma analysis of Moran dataset
# One design column per outcome group, no intercept.
design <- model.matrix(~ -1 + factor(moran_outcome_final))
design
# The design columns follow the factor level order, not the order in which
# the labels first appear in the data. Naming them with unique() would swap
# the group names (and flip the sign of the contrast) whenever the first
# sample does not belong to the first level, so take the names from the
# factor levels.
# NOTE(review): the contrast below requires the outcome labels to be exactly
# "parkinson" and "control" -- confirm against moran_outcome_final.
colnames(design) <- levels(factor(moran_outcome_final))

# compute simple linear model fit to microarray data
fit <- lmFit(moranvsn, design)

contrast.matrix <- makeContrasts(parkinson - control, levels = design)
fit2 <- contrasts.fit(fit, contrast.matrix)

eb <- eBayes(fit2)

# extract the ranking table (all genes) and show the top-ranked genes
ttable_moran <- topTable(eb, number = nrow(moranfilt))

head(ttable_moran)
moran_outcome_final

# Save the preprocessed datasets: for each dataset, the VSN-transformed
# matrix together with its final outcome labels.
save(
  moranvsn,
  moran_outcome_final,
  file = "moran_preprocessed.Rdata"
)
save(
  zhangvsn,
  zhang_outcome_final,
  file = "zhang_preprocessed.Rdata"
)
