#
# FMDfunctions.R
# 
# This script contains the R functions that we will use to perform 
# our FMD simulations.
#
install.packages("calibrate")
install.packages("ROCR")
install.packages("mixtools")
install.packages("cluster")
install.packages("WGCNA")
install.packages("impute")
install.packages("superpc")
install.packages("mixtools")
source("http://bioconductor.org/biocLite.R") ; biocLite("impute")

# Required packages
require(gplots)
require(Matrix)
require(MASS)
require(superpc)
require("impute")
require(WGCNA)
require(cluster)
require(mixtools)
require("calibrate")
require("ROCR")
#===============================================================================================#
# 
# SimDatHubOld - Function to simulate data under the Hub models
#                   based on y = alphaHub*xHub + alphaInd*xInd + e where e~N(0,V.e)
#
#  {add documentation later}
#
# Arguments :
#      n  = total number of samples
#      pfh = the number of features in the one functional hub
#      pnh = the number of features in each non-functional hub
#      mnh = the number of non-functional hubs
#      pfi = the number of features that are functional independents
#      pni = the number of features that are nonfunctional independents
#         Note: So, the total number of features will be denoted by p where
#             p=pfh+mnh*pnh+pfi+pni
#     
#   alphaHub = correlation parameter between xhub values (see below) and Y
#   alphaInd = correlation parameter between x rows corresponding to the functional
#             independent features and y
#   betaFHub = correlation parameter  between x rows corresponding to the functional
#              hub and xHub
#   betaNFHub = correlation between x rows corresponding to the nonfunctional
#              hub and each of their respective hubs (see xNFHub below)
#              note: for the time being, betaNFHub is the same for all 
#                    non functional hubs
#     V.e = variance of background noise
# 
#
# Output : List object of class "FMDsimHub" containing
#      xHub = 1 x n vector of the values for the latent hub across samples
#      xNFHub=  mnh x n matric of values for the latent nonfunctional hubs
#      x = p x n data matrix (see below)
#      y = n x 1 vector
#      simpars= list of arguments given above 
#      featureID=px1 character vector with values ('fh','nh','fi','ni')
#
#  comments regarding the data matrix x:
#     the rows of x are ordered as follows:
#            functional hub features      1:pfh
#            non-functional hub features  (pfh+1):((pfh+1)+mnh*pnh)
#            functional independents      ((pfh+1)+mnh*pnh+1):(((pfh+1)+mnh*pnh+1)+pfi)
#            non-functional independents  (((pfh+1)+mnh*pnh+1)+pfi+1):p
#
#
#===============================================================================================#

SimDatHubOld=function(  n,  # total number of samples
                        pfh, # the number of features in the one functional hub
                        pnh, # the number of features in each non-functional hub
                        mnh, # the number of non-functional hubs
                        pfi, # the number of features that are functional independents
                        pni, # the number of features that are nonfunctional independents
                        #         Note: So, the total number of features will be denoted by p where
                        #             p=pfh+mnh*pnh+pfi+pni
                        alphaHub, # correlation parameter between xhub values (see below) and Y
                        alphaInd, # correlation parameter between x rows corresponding to the functional
                        #             independent features and y
                        betaFHub, # correlation parameter  between x rows corresponding to the functional
                        #              hub and xHub
                        betaNFHub, # correlation between x rows corresponding to the nonfunctional
                        #              hub and each of their respective hubs (see xNFHub below)
                        #              note: for the time being, betaNFHub is the same for all 
                        #                    non functional hubs
                        V.e,  # variance of background noise                         
                        vrb=F # a verbose flag to enable printing of intermediate steps
){
  
  # n=100;pfh=10;pnh=10;mnh=5;pfi=100;pni=20;alphaHub=1;alphaInd=1;betaFHub=.5;betaNFHub=0.5;V.e=10;vrb=T
  
  p=pfh+mnh*pnh+pfi+pni
  
  x=matrix(rnorm(n*p,0,1),p,n)
  xNFHub=matrix(NA,mnh,n)
  y=rep(NA,n)
  featureID=rep("",p)
  featureID[1:pfh]="fh"
  featureID[(pfh+1):((pfh+1)+mnh*pnh)]="nh"
  fi.index <- ((pfh+1)+mnh*pnh+1):(((pfh+1)+mnh*pnh+1)+pfi)
  featureID[fi.index]="fi"
  featureID[ (((pfh+1)+mnh*pnh+1)+pfi+1):p]="ni"
  
  # simulate functional hub data
  
  xHub=rnorm(n,0,1)
  for(i in 1:pfh) x[i,]=2*(rbinom(1,1,.5)-.5)*betaFHub*xHub+rnorm(n,0,1)
  
  # simulate nonfunctional hub data
  
  for(h in 1:mnh){
    xNFHub[h,]=rnorm(n,0,1)
    #  for(i in 1:pnh) cat((pfh+i+(h-1)*pnh),fill=T)   
    for(i in 1:pnh) x[(pfh+i+(h-1)*pnh),]=2*(rbinom(1,1,.5)-.5)*betaNFHub*xNFHub[h,]+rnorm(n,0,1)    
  }
  
  # generate y values based on relationships between functional genes and y
  # y = alphaHub*xHub + alphaInd*xInd + e where e~N(0,V.e)
  
  y <- alphaHub*xHub + colSums(alphaInd*x[fi.index,]) + rnorm(n,mean=0,sd=sqrt(V.e))
  
  
  simpars <- list(n,pfh,pnh,mnh,pfi,pni,alphaHub,alphaInd,betaFHub,betaNFHub,V.e)
  lists <- list(xHub = xHub, xNFHub = xNFHub, x = x, y = y, simpars = simpars,featureID=featureID) 
  
  class(lists) <- "FMDsimHub"
  return(lists)
}



#===============================================================================================#
# 
# get.par - Function to get relevant parameters to simulate data (passing to the function "SimDatHubOld")
#			based on rm, SNR, and alphahub
#                   
#  {add documentation later}
#
# Arguments :
#      rm  = correlation among nonhub genes in the functional hub, i.e., betaFHub^2/(1+betaFHub^2)
#      SNR = signal noise ratio, i.e., alphaHub/sqrt(V.e)
#      alphaHub = correlation parameter between xhub values (see below) and Y
# 
#
# Output : List object containing
#   	alphaHub = correlation parameter between xhub values (see below) and Y
#       betaFHub = correlation parameter  between x rows corresponding to the functional
#                   hub and xHub
#       V.e = variance of background noise
#       ri = correlation between nonhub genes in the functional hub and Y, 
#		i.e., alphaHub*betaFHub/sqrt((1+betaFHub^2)*(alphaHub^2+V.e))
#       rh = correlation between hub genes in the functional hub and Y
#		i.e., alphaHub/sqrt(alphaHub^2+V.e)
#
#===============================================================================================#

get.par<-function(rm,SNR,alphaHub)
{
	betaFHub<-sqrt(rm/(1-rm))
	sig.sq<-(alphaHub/SNR)^2
	rh<-alphaHub/sqrt(alphaHub^2+sig.sq)
	ri<-alphaHub*betaFHub/sqrt((1+betaFHub^2)*(alphaHub^2+sig.sq))
	return(list(alphaHub=alphaHub,betaFHub=betaFHub,V.e=sig.sq,ri=ri,rh=rh))
}



#===============================================================================================#
# 
# Wmatrix - Function to get W 
#    
#  {add documentation later}
#
# Arguments :
#       data: list of p x n data matrix x and n x 1 outcome vector y
#       delta: the exponent for the cor(x) matrix
#       gamma: the exponent for the cor(x,y) %*% t(cor(x,y)) matrix
#
# Output :    
#       W: p x p matrix where W = ( cor(x) )^delta * ( cor(x,y) %*% t(cor(x,y)) )^gamma
#
#===============================================================================================#

Wmatrix <- function(data,delta,gamma){

  x <- t(data$x)  # x: n x p matrix
  y <- data$y
  n <- nrow(x)
  left <- cor(x)
  diag(left) <- 0
  covs <- cor(x, y)
  right <- covs %*% t(covs)
  W <- ( (abs(left)^delta)*sign(left) ) * ( (abs(right)^gamma)*sign(right) )
  W<-apply(W, 2, function(x) {ifelse(x < 0, 0, x)})   ##make negative to zero
  return(W)  

}

#===============================================================================================#
# 
# WGC - Function to identify modules using the Horvath's WGCNA method assuming the following
#       coexpression similarity matrix: the absolute value of the correlation coefficient between genes
#       adjacency matrix: weighed network adjacency defined by raising the co-expression similarity 
#                         to a power of delta (delta=6 by rule of thumb)
#       dissimilarity matrix: 1- adjacency matrix
#       clustering method: Partitioning Around Medoids (pam)
#
#  {add documentation later}
#
# Arguments :
#       data: list of p x n data matrix x and n x 1 outcome vector y
#       delta: the exponent for the cor(x) matrix
#       ncluster: positive integer specifying the number of clusters, less than the number of observations.
#
# Output :    
#       u: clustering membership with 1 denote the functional hub genes and 0 otherwise
#
#===============================================================================================#

WGC <- function(data, delta, ncluster){
  
  # The following setting is important, do not omit.
  options(stringsAsFactors = FALSE) 
  
  # dissimilarity matrix
  dissADJ <- 1-abs(cor(t(data$x)))^delta
  
  pam <- pam(as.dist(dissADJ), ncluster)  

  # Relating modules and module eigengenes to the outcome y 
  # by calculating the average of absolute  correlations with y within modules
  ModuleSignificance <- tapply(abs(as.numeric(cor(data$y,t(data$x)))), pam$clustering, mean, na.rm=T)

  # define the clustering with the largest value to be the functional hub
  u <- (pam$clustering==which.max(ModuleSignificance))*1	

  return (u)
}

#===============================================================================================#
# 
# Km - Function to identify modules using K-Means Clustering
#
#  {add documentation later}
#
# Arguments :
#       data: list of p x n data matrix x and n x 1 outcome vector y
#       ncluster: positive integer specifying the number of clusters, less than the number of observations.
#
# Output :    
#       u: clustering membership with 1 denote the functional hub genes and 0 otherwise
#
#===============================================================================================#

Km <- function(data, ncluster){
  
  k.fit <- kmeans(x=data$x,centers=ncluster,iter.max=50,nstart=25)
   
  # Relating modules and module eigengenes to the outcome y 
  # by calculating the average of absolute  correlations with y within modules
  ModuleSignificance <- tapply(abs(as.numeric(cor(data$y,t(data$x)))), k.fit$cluster, mean, na.rm=T)
 
  # define the clustering with the largest value to be the functional hub
  u <- (k.fit$cluster==which.max(ModuleSignificance))*1
  return(u)
}
  

#===============================================================================================#
# 
# WK - Function to identify modules by applying K-means (K=2) clustering to the first eigenvector of W matrix
#
#  {add documentation later}
#
# Arguments :
#       data: list of p x n data matrix x and n x 1 outcome vector y
#       delta: the delta parameter passed to "Wmatrix" function, i.e., the exponent for the cor(x) matrix 
#       gamma: the gamma parameter passed to "Wmatrix" function, i.e.,the exponent for the cor(x,y) %*% t(cor(x,y)) matrix
#
# Output :    
#       u: clustering membership with 1 denote the functional hub genes and 0 otherwise
#
#===============================================================================================#

WK <- function(data,delta,gamma){
  
  # generate W matrix
  W=Wmatrix(data,delta,gamma)
  
  # the first eigenvector of W
  eigenvec=eigen(W)$vectors[,1]
  
  # K-means clustering with K=2
  k.fit=kmeans(eigenvec,2,iter.max=50,nstart=25)

  # Relating modules and module eigengenes to the outcome y 
  # by calculating the average of absolute  correlations with y within modules
  ModuleSignificance <- tapply(abs(as.numeric(cor(data$y,t(data$x)))), k.fit$cluster, mean, na.rm=T)
  
  # define the clustering with the largest value to be the functional hub
  u<-(k.fit$cluster==which.max(ModuleSignificance))*1

  return (u)
}




#===============================================================================================#
# 
# bt.func - Function to bootstrap the sample 
#		and to get the corresponding first eigenvector of Wmatrix based on the boostrapped sample
#
#  {add documentation later}
#
# Arguments :
#       data: list of p x n data matrix x and n x 1 outcome vector y
#       delta: the delta parameter passed to "Wmatrix" function, i.e., the exponent for the cor(x) matrix 
#       gamma: the gamma parameter passed to "Wmatrix" function, i.e.,the exponent for the cor(x,y) %*% t(cor(x,y)) matrix
#
#
# Output :    
#       e: the corresponding first eigenvector of Wmatrix based on boostrapped samples
#
#===============================================================================================#


bt.func <- function(data,delta,gamma){  # data$x: p*n
	bt <- sample(1:dim(data$x)[2],replace = TRUE)
	datanew <- list(x=data$x[,bt],y=data$y[bt])
  Wnew <- Wmatrix(datanew,delta=delta,gamma=gamma)
	e <- eigen(Wnew)$vectors[,1]
	return(e)
}


#===============================================================================================#
# 
# WK.bt - Function to identify modules by applying K-means (K=2) clustering to 
#		the mean of first eigenvector of W matrix based on boostrapped samples
#
#  {add documentation later}
#
# Arguments :
#       data: list of p x n data matrix x and n x 1 outcome vector y
#       delta: the delta parameter passed to "Wmatrix" function, i.e., the exponent for the cor(x) matrix 
#       gamma: the gamma parameter passed to "Wmatrix" function, i.e.,the exponent for the cor(x,y) %*% t(cor(x,y)) matrix
#
# Output :    
#       u: clustering membership with 1 denote the functional hub genes and 0 otherwise
#
#===============================================================================================#

WK.bt <- function(data,delta,gamma){
  
  # the number of bootstrapped samples
  Nbp <- 500  

  # the first eigenvector of W based on Nbp boostrapped samples
  eigen.bt <- replicate(Nbp,bt.func(data=data,delta=delta,gamma=gamma))

  # applying K-means (K=2) clustering to the mean of first eigenvector of W matrix based on boostrapped samples
  k.fit <- kmeans(apply(eigen.bt,1,mean),2,iter.max=100,nstart=25)

  # Relating modules and module eigengenes to the outcome y 
  # by calculating the average of absolute  correlations with y within modules
  ModuleSignificance<-tapply(abs(as.numeric(cor(data$y,t(data$x)))), k.fit$cluster, mean, na.rm=T)

  # define the clustering with the largest value to be the functional hub
  u<-(k.fit$cluster==which.max(ModuleSignificance))*1  

  return (u)
}


#===============================================================================================#
# 
# spc - Function to identify modules using Tibshirani's supervised principal components method a
#
#  {add documentation later}
#
# Arguments :
#       dat: list of p x n data matrix x and n x 1 outcome vector y
#
# Output :    
#       u: clustering membership with 1 denote the functional genes and 0 otherwise
#
#===============================================================================================#

spc <- function(data){
  
  # supervised principal components
  train.obj <- superpc.train(data, type="regression")
  
  # cross-validation to estimate the optimal feature threshold in supervised principal components
  # functions in the "superpc" package : big variability of thresholds
  # Sometimes it gives an error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :  NA/NaN/Inf in 'x'
  # and  lots of warnings
  cv.obj <- superpc.cv(train.obj, data)
  th<-cv.obj$thresholds[which.max(cv.obj$scor[1,])]  ## thresholds for the 1st component
  u <- (abs(train.obj$feature.scores) > th)*1
  
  return (u)
}



#===============================================================================================#
# 
# WK.W - Function to identify modules by applying K-means clustering to W matrix
#
#  {add documentation later}
#
# Arguments :
#       data: list of p x n data matrix x and n x 1 outcome vector y
#       delta: the delta parameter passed to "Wmatrix" function, i.e., the exponent for the cor(x) matrix 
#       gamma: the gamma parameter passed to "Wmatrix" function, i.e.,the exponent for the cor(x,y) %*% t(cor(x,y)) matrix
#       W: p x p matrix derived from "Wmatrix" function
#       ncluster: positive integer specifying the number of clusters, less than the number of observations.
#
#
# Output :    
#       u: clustering membership with 1 denote the functional hub genes and 0 otherwise
#
#===============================================================================================#

WK.W <- function(data,delta,gamma,ncluster){
  
  # generate W matrix
  W=Wmatrix(data,delta,gamma)
  
  # K-means clustering based on W matrix
  k.fit=kmeans(W,ncluster,iter.max=50,nstart=25)

  # Relating modules and module eigengenes to the outcome y 
  # by calculating the average of absolute  correlations with y within modules
  ModuleSignificance <- tapply(abs(as.numeric(cor(data$y,t(data$x)))), k.fit$cluster, mean, na.rm=T)
  
  # define the clustering with the largest value to be the functional hub
  u<-(k.fit$cluster==which.max(ModuleSignificance))*1

  return (u)
}



#===============================================================================================#
# 
# perf - Function to evaluate the performance of module identified
#
#  {add documentation later}
#
# Arguments :
#       u: clustering membership with 1 denote the functional genes and 0 otherwise 
#          returned from functions ("WGC", "Km", "WK" or "spc")
#       truth:  the true clustering membership with 1 denote the functional hub genes and 0 otherwise 
#
# Output :  a vector containing
#       FPR: false positive rate
#       TPR: true positive rate
#       FDR: false discovery rate
#===============================================================================================#

perf <- function(u, truth){
  
  pred <- prediction( u, truth)
  perf1 <- performance(pred, "tpr", "fpr")
  FPR <- unlist(perf1@x.values)[2]
  TPR <- unlist(perf1@y.values)[2]
  
  # FDR=1/(1 + a*TPR/FPR)
  nFH <- sum(truth)
  a <- nFH/(length(truth)-nFH)
  FDR <- 1/(1 + a*TPR/FPR)
  
  table <- c(FPR,TPR,FDR)
  names(table) <- c("FPR","TPR","FDR")
  
  return(table)
}



