LRpath<-function(sigvals, geneids, min.g=10, max.g=NA, sig.cutoff=0.05, database="GO", odds.min.max=c(0.001,0.5), species) { ########################################################################################### # Function for LRpath: testing GO terms or KEGG with logistic regression # Written by: Maureen Sartor, University of Cincinnati, 2008 ########################################################################################### ## ## This function uses logistic regression to test for enriched biological ## categories in gene expression data. Our method models the probability ## of a randomly selected gene belonging to a specific category given the ## significance level of that gene. For categories significantly affected by ## the experimental condition, this probability will increase as the significance ## statistic increases. Categories with significant p-values and positive slope ## coefficients are enriched with differentially expressed genes. ## ## Please acknowledge your use of LRpath in publications by referencing: ## Sartor MA, Leikauf GD, and Medvedovic M. LRpath: a logistic regression approach for ## identifying enriched biological groups in gene expression data. (submitted ## to Bioinformatics), 2008. ## ## Inputs: ## 8 parameters: sigvals, geneids, min.g, max.g, sig.cutoff, database, odds.min.max, species ## "sigvals" a vector of p-values, same length and order as "geneids" ## "geneids" a vector of Entrez gene IDs, may contain duplicates and missing values. ## "min.g" The minimum number of unique gene IDs analyzed in category to be tested ## default = 10 ## "max.g" The maximum number of unique gene IDs analyzed in category to be tested ## default = NA (99999) ## "sig.cutoff" Entrez gene IDs in each category with p-values=min.g] ncats<-length(yy) #siggenes <- uniqids[exp(-newp)<0.05] ## If averaged p-value < 0.05 siggenes <- uniqids[exp(-newp)=min.g&cattot<=max.g) { ind<-ind+1 cat<-rep(0,numuniq) cat[catpoprows]<- 1 forLR<- as.data.frame(cbind(cat,newp)) names(forLR)<-c("cat","nlogp") glm.lrGO <- glm(cat ~ nlogp, family=binomial(link="logit"),forLR) lrGO<-summary(glm.lrGO) ########### Extract p-value and coefficient LRcoeff[ind]<-lrGO$coefficients["nlogp","Estimate"] LRpval[ind]<-lrGO$coefficients["nlogp","Pr(>|z|)"] cattots[ind]<-cattot #yyind[ind]<-i GOnums[ind]<-names(yy[i]) catsig.IDlist<-intersect(siggenes,catgenes) catsigIDs[ind]<-paste(catsig.IDlist[order(catsig.IDlist)],collapse=", ") } if ((i-ncats/10)>0&(i-ncats/10)<1.1) { print("10% categories finished.") } if ((i-ncats/5)>0&(i-ncats/5)<1.1) { print("20% categories finished.") } if ((i-ncats/2)>0&(i-ncats/2)<1.1) { print("50% categories finished.") } if ((i-9*ncats/10)>0&(i-9*ncats/10)<1.1) { print("90% categories finished.") } } odds.ratio<-exp(LOR.mult*LRcoeff) BenjFDR<-p.adjust(LRpval,"BH") if (database=="GO") { ############## Annotate with GO names and Ontology GOterms<-as.list(GOTERM) GOterms<-GOterms[!is.na(GOterms)] terms<-as.vector(sapply(GOterms,Term)) # Requires annotate library Ontologies<-as.vector(sapply(GOterms,Ontology)) AllGOids<-as.vector(names(GOterms)) terms2<-cbind(AllGOids,terms,Ontologies) GOrows<-match(GOnums,AllGOids) GOannot<-terms2[GOrows,] LRresults<-cbind(GOnums,GOannot[,c("terms","Ontologies")],cattots,LRcoeff, odds.ratio,as.data.frame(LRpval),BenjFDR,catsigIDs) names(LRresults)<- c("GO.ID", "GO.term", "Ontology", "n.genes", "coeff", "odds.ratio", "p.value", "FDR", "sig.genes") } if (database=="KEGG") { KEGGterms<- as.list(KEGGPATHNAME2ID) kterms<- as.vector(names(KEGGterms)) k.ids<- paste(species,unlist(KEGGterms),sep="") keggrows<- match(GOnums,k.ids) kegg.annot<-kterms[keggrows] LRresults<-cbind(GOnums,kegg.annot,cattots,LRcoeff, odds.ratio,as.data.frame(LRpval),BenjFDR,catsigIDs) names(LRresults)<- c("KEGG.ID", "KEGG.pathway", "n.genes", "coeff", "odds.ratio", "p.value", "FDR", "sig.genes") } LRresults } #End function LRpath