# #this reads the file names from the xlsx sheet
# 
#   meta.data<-read.xlsx("data/GSEA_Dataset_metadata v2.xlsx",sheetName="Datasets.new")
# 
# #meta.data<-read.xlsx("data/developement.temp.xlsx",sheetName="Datasets.new")
#  
#  
# # meta.data<-read.csv("data/GSEA_Dataset_metadata_v2.csv")
# ind.ExprSet<-grep("ExprSet",meta.data$Type)
# meta.data<-meta.data[ind.ExprSet,c(4:6,12)]
# esets.names<-gsub(".Expr.txt","",meta.data[,2])
# #the processed ES data.frames of same names are stored localy in data/ES_Sets
# 
# 
# Read data/Signatures_Refs.csv into `signature_info`
# (presumably reference metadata for the signatures -- TODO confirm columns).
signature_info<-read.csv("data/Signatures_Refs.csv")

# Restore whatever objects were saved in data/esets.r into the current
# environment. NOTE(review): the object names are not visible from this file.
load("data/esets.r")



# Restore the objects saved in data/exprsets.r
# (presumably the expression sets -- TODO confirm).
load("data/exprsets.r")
# Restore the objects saved in data/gsc.sets.r
# (presumably the predefined gene set collections / signatures -- TODO confirm).
load("data/gsc.sets.r")

 
 
#read user uploaded signatures
# Parse a signature file into a gene set collection.
#
# The file is read line by line; each line is split on runs of whitespace.
# The first token of a line becomes the set name, the remaining tokens
# (de-duplicated) become the gene identifiers of that set.
#
# Args:
#   filename: path (or connection) handed to scan().
#
# Returns:
#   The result of GeneSetCollection() over one GeneSet per line, each built
#   with a SymbolIdentifier() gene id type.
read_signatures <- function(filename) {
  raw_lines <- scan(filename, what = "", sep = "\n")
  tokens <- strsplit(raw_lines, "[[:space:]]+")

  # First token names the set, everything after it is the gene list.
  set_labels <- sapply(tokens, function(tok) tok[[1]])
  gene_lists <- lapply(tokens, function(tok) unique(tok[-1]))
  names(gene_lists) <- set_labels

  build_set <- function(geneIds, n) {
    GeneSet(geneIds, geneIdType = SymbolIdentifier(), setName = n)
  }
  sets <- mapply(build_set, gene_lists, names(gene_lists))

  GeneSetCollection(sets)
}
  
  
# Filter an expression set and compute GSVA enrichment scores for a gene set
# collection.
#
# Args:
#   edata:      expression data handed to nsFilter() (presumably an
#               ExpressionSet -- TODO confirm).
#   geneset:    gene sets handed to gsva().
#   var_cutoff: forwarded to nsFilter() as var.cutoff. NOTE(review): the call
#               also sets var.filter = FALSE, so this value looks to be
#               ignored by the filter -- confirm whether that is intended.
#
# Returns (invisibly):
#   The `es.obs` element of the gsva() result, or NULL if gsva() raised an
#   error (in which case "gene set collection bad format" is printed).
#
# History kept from earlier revisions of this function (all disabled):
#   - converting the collection to a plain list when annotation(edata) is
#     empty (temporary workaround for the "sommalogic" data);
#   - a temporary nsFilter() variant with remove.dupEntrez = TRUE followed by
#     gsva(..., min.sz = 3, max.sz = 1000);
#   - gsva(..., min.sz = 1, max.sz = 1000) without the current filtering.
applyGSVA <- function(edata, geneset, var_cutoff) {
  filter_result <- nsFilter(
    edata,
    require.entrez = FALSE,
    remove.dupEntrez = FALSE,
    var.func = IQR,
    var.filter = FALSE,
    var.cutoff = var_cutoff,
    filterByQuantile = TRUE,
    feature.exclude = "^AFFX"
  )
  edata <- filter_result$eset

  # Any failure inside gsva() is reported with the same fixed message.
  report_failure <- function(e) {
    print("gene set collection bad format")
    NULL
  }

  scores <- tryCatch(
    gsva(
      edata, geneset,
      min.sz = 1, max.sz = 4000,
      abs.ranking = FALSE, mx.diff = TRUE
    )$es.obs,
    error = report_failure
  )

  invisible(scores)
}




# Flatten a score set into a data frame and drop annotation columns that are
# constant across all samples.
#
# Args:
#   esdf:    object for which exprs() gives a signatures-by-samples matrix and
#            data.frame() gives one row per sample (presumably an
#            ExpressionSet of enrichment scores -- TODO confirm).
#   setname: not used by this function; kept for interface compatibility.
#
# Returns:
#   A data frame whose first column holds the former row names, followed by
#   the signature columns and then only those remaining columns that take
#   more than one distinct value. If no remaining column varies (or there are
#   none at all) the data frame is returned with all of its columns.
#   Previously that case returned NULL, because the trailing `if` had no
#   `else` branch, or failed outright when there were no extra columns.
formatesdf <- function(esdf, setname) {
  # Leading columns that are always kept: the name column added below plus
  # one column per signature.
  sig_num <- nrow(exprs(esdf)) + 1

  # Convert to a data frame and move the row names into the first column.
  esdf <- data.frame(esdf)
  esdf <- cbind(rownames(esdf), esdf)
  rownames(esdf) <- seq_len(nrow(esdf))

  # Columns after the signature block; empty when there are none, which the
  # old `(sig_num + 1):ncol(esdf)` sequence could not express.
  extra_cols <- setdiff(seq_len(ncol(esdf)), seq_len(sig_num))

  # A column with a single distinct value cannot be an experimental condition.
  varies <- vapply(
    extra_cols,
    function(i) length(unique(esdf[, i])) != 1,
    logical(1)
  )
  colind <- extra_cols[varies]

  # Some experiments hold no varying conditions: only subset when something
  # would be left to show.
  if (length(colind) > 0) {
    esdf <- esdf[, c(seq_len(sig_num), colind)]
  }

  esdf
}


# Draw a box plot of one numeric column grouped by another column, with the
# individual points jittered on top and coloured by a third column.
#
# Args:
#   indata: data frame (or matrix-like with column names) holding the data.
#   inx:    column index/name used for the grouping on the x axis.
#   iny:    column index/name of the values; must be numeric.
#   colcol: column index/name used to colour the points.
#
# Returns:
#   NULL, without plotting, when column `iny` is not numeric; otherwise the
#   value of print(p) after the plot has been printed.
showboxplot <- function(indata, inx, iny, colcol) {
  # Nothing to draw for a non-numeric value column.
  if (!is.numeric(indata[, iny])) {
    return()
  }

  # `condition` and `colour` are referenced by name inside aes(); they are
  # looked up in this function's environment, so the names must stay as is.
  condition <- indata[, inx]
  colour <- factor(indata[, colcol])
  score_name <- colnames(indata)[iny]

  canvas <- ggplot(
    indata,
    aes(condition, y = indata[, iny]),
    environment = environment()
  )
  jittered_points <- geom_jitter(
    aes(colour = colour),
    size = 3.5,
    position = position_jitter(width = 0.2),
    show_guide = TRUE
  )
  appearance <- theme(
    axis.text.x = element_text(size = 11),
    legend.position = "top"
  )

  # The later labs() calls override the axis titles set by ylab()/xlab():
  # no x title, a fixed y title, and the value column name as plot title.
  p <- canvas +
    geom_boxplot() +
    jittered_points +
    ylab(score_name) +
    xlab(colnames(indata)[inx]) +
    appearance +
    labs(x = NULL) +
    labs(y = "Enrichment score") +
    labs(title = score_name)

  print(p)
}


# Scatter plot of two columns with a linear fit, points coloured by a third
# column, and a title summarising sample count, correlation and p-value.
#
# Args:
#   indata: data frame holding the data.
#   x, y:   column indices/names plotted on the x and y axes.
#   z:      column index/name used to colour the points.
#
# Returns:
#   The value of print(p) after the plot has been printed.
showplot <- function(indata, x, y, z) {
  # `cohort` is referenced by name inside aes() below.
  cohort <- indata[, z]

  x_values <- indata[, x]
  y_values <- indata[, y]
  r_value <- round(
    cor(x_values, y_values, use = "pairwise.complete.obs"),
    digits = 3
  )
  p_value <- round(cor.test(x_values, y_values)$p.value, digits = 3)

  # Three title fragments, joined by single spaces.
  fig_text <- paste(
    paste0("N=", nrow(indata), ",\n"),
    paste0("r =", r_value, ",\n"),
    paste0("p-value=", p_value, "\n")
  )

  p <- ggplot(
    indata,
    aes(indata[, x], indata[, y]),
    environment = environment()
  ) +
    stat_smooth(method = "lm", se = FALSE) +
    geom_point(aes(colour = factor(cohort)), size = 4) +
    ylab(colnames(indata)[y]) +
    xlab(colnames(indata)[x]) +
    labs(title = fig_text)

  print(p)
}


# Centre the first column of `dataset` on the mean of a control group.
#
# Args:
#   dataset:       data frame (or matrix with column names) whose first
#                  column holds the values to normalise.
#   control_group: value of the cohort column that identifies control rows.
#   cohort_group:  name of the column holding the group labels.
#
# Returns:
#   `dataset` with the control-group mean subtracted from every entry of the
#   first column; all other columns are untouched.
#
# Rows whose group label is NA are not treated as controls, and NA values
# among the control rows are ignored when averaging. Previously either case
# turned the mean into NA and wiped the whole column. If no row belongs to
# the control group the mean is NaN, as before.
normalise2control <- function(dataset, control_group, cohort_group) {
  cohort_col <- match(cohort_group, colnames(dataset))
  if (anyNA(cohort_col)) {
    stop("cohort column '", cohort_group, "' not found in dataset",
         call. = FALSE)
  }

  # which() drops NA comparisons, so unlabeled rows cannot leak NA into the
  # subset used for the mean.
  control_rows <- which(dataset[, cohort_col] == control_group)
  control_mean <- mean(dataset[control_rows, 1], na.rm = TRUE)

  # Subtract the control mean from the entire data column.
  dataset[, 1] <- dataset[, 1] - control_mean
  dataset
}



#######to do the stats - TO DO (incomplete)
# Pairwise differential analysis of signature scores between groups.
#
# For every pair of groups found in the last column of `eset`, the samples of
# the two groups are put into an ExpressionSet, a two-column design is fitted
# with lmFit()/eBayes(), and topTable() is asked for the signatures passing
# the adjusted p-value cutoff (BH adjustment).
#
# Args:
#   eset: table with one row per sample; every column but the last holds
#         signature scores, the last column holds the group label.
#   rn:   sample names, one per row of `eset`; they must be unique within
#         each compared pair because they are used as row names.
#   adjp: adjusted p-value cutoff handed to topTable().
#
# Returns:
#   "TOO MANY GROUPS" when more than 8 distinct groups are present;
#   "No deregulated pathways" when fewer than two groups remain or no
#   comparison yields a signature below the cutoff; otherwise a data frame
#   stacking the per-comparison topTable() results, with a leading "Groups"
#   column ("<group1> VS <group2>") followed by the signature name column.
#
# The reference level of each comparison is fixed to <group1>, so the
# reported effect is always <group2> minus <group1>. Before, the reference
# depended on factor level ordering and the sign could flip between pairs.
#
# Alternative that was sketched but never implemented: fit one
# `~ 0 + factor(group)` design for all groups and extract every pairwise
# comparison with makeContrasts() / contrasts.fit().
statistics <- function(eset, rn, adjp) {
  cohort_col <- ncol(eset)

  # remove NA's
  labelled <- !is.na(eset[, cohort_col])
  rn <- rn[labelled]
  eset <- eset[labelled, ]

  cohort <- as.character(eset[, cohort_col])
  groups <- unique(cohort)

  if (length(groups) > 8) {
    return("TOO MANY GROUPS")
  }
  # With fewer than two groups there is nothing to compare.
  if (length(groups) < 2) {
    return("No deregulated pathways")
  }

  score_cols <- seq_len(cohort_col - 1)
  groups.list <- list()

  # Same pair order as nested i < j loops: (1,2), (1,3), ..., (2,3), ...
  for (pair in combn(seq_along(groups), 2, simplify = FALSE)) {
    group1 <- groups[pair[1]]
    group2 <- groups[pair[2]]
    comparison <- paste(group1, "VS", group2)
    ind <- cohort %in% c(group1, group2)

    # Samples in rows here; drop = FALSE keeps the signature name even when
    # there is only one score column.
    ex <- as.matrix(eset[ind, score_cols, drop = FALSE])
    rownames(ex) <- rn[ind]

    pd <- data.frame(
      cohort = factor(cohort[ind], levels = c(group1, group2))
    )
    rownames(pd) <- rn[ind]
    phenoData <- new("AnnotatedDataFrame", data = pd)

    # ExpressionSet wants signatures in rows and samples in columns.
    eset_es <- ExpressionSet(assayData = t(ex), phenoData = phenoData)

    # do the analysis
    design <- model.matrix(~ eset_es$cohort)
    colnames(design) <- c(group1, comparison)
    fit <- eBayes(lmFit(eset_es, design))
    DEgeneSets <- topTable(
      fit,
      coef = comparison,
      number = Inf,
      p.value = adjp,
      adjust.method = "BH"
    )

    if (nrow(DEgeneSets) > 0) {
      hits <- cbind(rownames(DEgeneSets), DEgeneSets)
      colnames(hits)[1] <- "Gene Signature"
      groups.list[[comparison]] <- hits
    }
  }

  # Decide before stacking: renaming the first column of an empty result
  # would fail.
  if (length(groups.list) == 0) {
    return("No deregulated pathways")
  }

  groups.list <- ldply(groups.list, data.frame)
  colnames(groups.list)[1] <- "Groups"
  groups.list
}
####### end of stats
