LRpath<-function(sigvals, geneids, min.g=10, max.g=NA, sig.cutoff=0.05,
	database="GO", odds.min.max=c(0.001,0.5), species) {

###########################################################################################
#  Function for LRpath: testing GO terms or KEGG with logistic regression
#  Written by: Maureen Sartor, University of Cincinnati, 2008
###########################################################################################
##
##  This function uses logistic regression to test for enriched biological
##  categories in gene expression data. Our method models the probability 
##  of a randomly selected gene belonging to a specific category given the
##  significance level of that gene.  For categories significantly affected by
##  the experimental condition, this probability will increase as the significance
##  statistic increases.  Categories with significant p-values and positive slope 
##  coefficients are enriched with differentially expressed genes. 
##
##  Please acknowledge your use of LRpath in publications by referencing:
##  Sartor MA, Leikauf GD, and Medvedovic M.  LRpath: a logistic regression approach for
##  identifying enriched biological groups in gene expression data. (submitted 
##  to Bioinformatics), 2008.
##
##  Inputs:
##  8 parameters: sigvals, geneids, min.g, max.g, sig.cutoff, database, odds.min.max, species
##  "sigvals" a vector of p-values, same length and order as "geneids"
##  "geneids" a vector of Entrez gene IDs, may contain duplicates and missing values.
##  "min.g"  The minimum number of unique gene IDs analyzed in category to be tested
##		default = 10
##  "max.g"  The maximum number of unique gene IDs analyzed in category to be tested
##		default = NA (99999)
##  "sig.cutoff" Entrez gene IDs in each category with p-values<sig.cutoff will be returned
##		default = 0.05
##  "database" database to be tested- currently, either "GO" or "KEGG"
##		default = "GO"
##  "odds.min.max" lower and upper p-values to be used for odds ratio calculation.
##		default= c(0.001, 0.5)
##  "species"  Used only for KEGG.  human="hsa", mouse="mmu", rat="rno", etc. (no default)
##
##  Outputs:
##  object is dataframe with the following columns:
##	GO or KEGG ID	- posterior t-value for IBMT
##	GO or KEGG term 	- name of category
##	Ontology (only for GO) - BP, MF, or CC
##	n.genes - Number of unique Entrez Gene IDs in category
##	coeff - coefficient of slope (only GO terms with positive values are significant)
##    odds.ratio -  Odds ratio, as measure of strenth of enrichment
## 	p.value	-  P-value that slope does not equal zero (that term is enriched)
##	FDR	- False Discovery Rate (Benjamini & Hochberg, 1995)
##  	sig.genes	- comma separated Entrez gene ids in category with p-value<"sig.cutoff"
##
##    Examples:
##    pvalues <- IBMT.results$IBMT.p[,1]
##    entrez.geneid <- unlist(mget(featureNames(affy.eset),env=hgu133aENTREZID))
##    GO.results <- LRpath(sigvals=pvalues, geneids=entrez.geneid, database="GO")
##    KEGG.results <- LRpath(sigvals=pvalues, geneids=entrez.geneid, database="KEGG",
##		species="hsa")
##
##########################################################################################


library("annotate")
library("stats")
	
if (database=="GO") {
     library("GO")
     ###  Get GO list of Entrez genes
     xx<-as.list(GOALLENTREZID)
     ##Remove GO ids that aren't mapped to any Entrez Gene id
     xx<-xx[!is.na(xx)]

	## Determine which Entrez Gene IDs are annotated anywhere in GO
	## Gene ids not annotated in GO are left out of the environment
     ENTREZ.in.DB<- as.numeric(names(as.list(GOENTREZID2GO)))
}
#########  KEGG works for human, mouse, rat, and any other species that uses Entrez geneID
if (database=="KEGG") {	
	library(KEGG)
	xx<-as.list(KEGGPATHID2EXTID)
	xx<-xx[!is.na(xx)]

	## Determine which Entrez Gene IDs are annotated anywhere in KEGG
	ENTREZ.in.DB<- as.numeric(names(as.list(KEGGEXTID2PATHID)))
	ENTREZ.in.DB<- ENTREZ.in.DB[!is.na(ENTREZ.in.DB)]
}

sigvals[sigvals==0]<- 10^(-15)
geneids2<-geneids[!is.na(geneids)&!is.na(sigvals)&geneids %in% ENTREZ.in.DB]
sigvals2<-sigvals[!is.na(geneids)&!is.na(sigvals)&geneids %in% ENTREZ.in.DB]

uniqids<-unique(geneids2)
numuniq<-length(uniqids)
LOR.mult<-log(odds.min.max[2])-log(odds.min.max[1])

################ Average -log(p-values) for duplicate locusids
nlp<-(-1)*log(sigvals2)
newp<-NA
for (i in (1:numuniq)){
	current<-nlp[geneids2==uniqids[i]]  #Get all values for i-th geneid
	#numrep<-length(current)
	newp[i]<-mean(current)
}
#plot(c(1,2,3),c(1,2,3))
### Limit testing to those categories with at least min.g genes TOTAL
catsizes<-sapply(xx,length)
yy<-xx[catsizes>=min.g]
ncats<-length(yy)

#siggenes <- uniqids[exp(-newp)<0.05]  ##  If averaged p-value < 0.05
siggenes <- uniqids[exp(-newp)<sig.cutoff]

######################################################################
###  Test all categories to which at least min ANALYZED genes belong

LRcoeff<-NA; LRpval<-NA; cattots<-NA; yyind<-NA; GOnums<-NA
catsigIDs<-NA
ind<-0
if (is.na(max.g)) {
	max.g<-99999
}
#for (i in (1:100)){
for (i in (1:ncats)){
	catgenes<-as.vector(sapply(yy[[i]],as.integer)) #as.vector(yy[[1]])
	catgenes<-unique(catgenes)
	catpoprows<-match(catgenes,as.vector(uniqids,mode="integer"))
	catpoprows<-catpoprows[!is.na(catpoprows)]

	cattot<-length(catpoprows)	#Number of analyzed genes in category
	if (cattot>=min.g&cattot<=max.g) {
		ind<-ind+1
		cat<-rep(0,numuniq)
		cat[catpoprows]<- 1
		forLR<- as.data.frame(cbind(cat,newp))
		names(forLR)<-c("cat","nlogp")

		glm.lrGO <- glm(cat ~ nlogp, family=binomial(link="logit"),forLR)
		lrGO<-summary(glm.lrGO)

		###########  Extract p-value and coefficient
		LRcoeff[ind]<-lrGO$coefficients["nlogp","Estimate"]
		LRpval[ind]<-lrGO$coefficients["nlogp","Pr(>|z|)"]
		cattots[ind]<-cattot
		#yyind[ind]<-i
		GOnums[ind]<-names(yy[i])

		catsig.IDlist<-intersect(siggenes,catgenes)
		catsigIDs[ind]<-paste(catsig.IDlist[order(catsig.IDlist)],collapse=", ")
      }
	if ((i-ncats/10)>0&(i-ncats/10)<1.1) {
		print("10% categories finished.")
	}
	if ((i-ncats/5)>0&(i-ncats/5)<1.1) {
		print("20% categories finished.")
	}
	if ((i-ncats/2)>0&(i-ncats/2)<1.1) {
		print("50% categories finished.")
	}
	if ((i-9*ncats/10)>0&(i-9*ncats/10)<1.1) {
		print("90% categories finished.")
	}

}

odds.ratio<-exp(LOR.mult*LRcoeff)
BenjFDR<-p.adjust(LRpval,"BH")

if (database=="GO") {
	############## Annotate with GO names and Ontology
	GOterms<-as.list(GOTERM)
	GOterms<-GOterms[!is.na(GOterms)]
	terms<-as.vector(sapply(GOterms,Term))  # Requires annotate library
	Ontologies<-as.vector(sapply(GOterms,Ontology))
	AllGOids<-as.vector(names(GOterms))
	terms2<-cbind(AllGOids,terms,Ontologies)

	GOrows<-match(GOnums,AllGOids)
	GOannot<-terms2[GOrows,]
	LRresults<-cbind(GOnums,GOannot[,c("terms","Ontologies")],cattots,LRcoeff,
	     odds.ratio,as.data.frame(LRpval),BenjFDR,catsigIDs)
	names(LRresults)<- c("GO.ID", "GO.term", "Ontology", "n.genes", "coeff", "odds.ratio",
		"p.value", "FDR", "sig.genes")
}

if (database=="KEGG") {
	KEGGterms<- as.list(KEGGPATHNAME2ID)
	kterms<- as.vector(names(KEGGterms))
	k.ids<- paste(species,unlist(KEGGterms),sep="")
	keggrows<- match(GOnums,k.ids)
	kegg.annot<-kterms[keggrows]
	LRresults<-cbind(GOnums,kegg.annot,cattots,LRcoeff,
	     odds.ratio,as.data.frame(LRpval),BenjFDR,catsigIDs)
	names(LRresults)<- c("KEGG.ID", "KEGG.pathway", "n.genes", "coeff", "odds.ratio",
		"p.value", "FDR", "sig.genes")
}

LRresults

} #End function LRpath