#Do analysis on uclust output uclust.analysis<-function(fi,fo,c.names='None',k=0){ library(gplots) library(multicore) clust<-read.table(file=fi,stringsAsFactors=FALSE,comment.char='#') print(paste("Read in",fi)) if(c.names!='None'){ t.names<-read.table(file=c.names,stringsAsFactors=FALSE,sep="\t") c.names<-c() c.names[as.character(t.names$V1)]<-sapply(t.names$V2,function(x){paste(strsplit(x," ")[[1]],collapse="")}) rm(t.names) } pdf(file=fo) colnames(clust)<-c('V1','cluster','len','V4','V5','V6','V7','V8','query','target') cluster.hist(clust) sample.heatmap(clust) #cluster.heatmap(clust,c.names=c.names,k=k) dev.off() } cluster.hist<-function(clust){ print("Making cluster histogram") clust<-subset(clust,V1 %in% c('C','H')) clust.num<-sapply(unique(clust$cluster),function(x){sum(clust$cluster==x)}) max.clust.num<-as.numeric(max(clust.num)) plot(c(1,6),c(1,max.clust.num),type='n',xaxt='n',yaxt='n',xlab='',ylab='') text(3,0,label=paste("Total number of ORFs: ",nrow(clust))) for(q in c(1:max.clust.num)){ text(3,q,label=paste("Protein families with",q,"members: ",sum(clust.num==q))) } hist(clust.num,xlab='Number of proteins in cluster',ylab='Number of clusters',main='Histogram of cluster size') hist(clust.num[clust.num>1],xlab='Number of proteins in cluster',ylab='Number of clusters',main='Histogram of cluster size') hist(clust.num[clust.num>3],xlab='Number of proteins in cluster',ylab='Number of clusters',main='Histogram of cluster size') hist(clust.num[clust.num>5],xlab='Number of proteins in cluster',ylab='Number of clusters',main='Histogram of cluster size') } sample.heatmap<-function(clust){ print("Making sample heatmap") clust<-subset(clust,V1 %in% c('C','H')) clust$size<-sapply(clust$cluster,function(x){sum(clust$cluster==x)}) clust<-subset(clust,size>1) #First make matrix of occurance in different samples clust$samp<-substr(clust$query,1,4) samps<-sort(unique(clust$samp)) samp.mat<-matrix(0,nrow=length(samps),ncol=length(samps),dimnames=list(samps,samps)) for(p in samps){ for(q in samps){ #Calculates the number of protein families that are found in both samples samp.mat[p,q]<-length(intersect(clust$cluster[clust$samp==p],clust$cluster[clust$samp==q])) } } print(samp.mat) heatmap.2(samp.mat,density.info='none',trace='none',cellnote=samp.mat,notecol='black') } cluster.heatmap<-function(clust,c.names='None',k=0){ clust$cluster<-as.character(clust$cluster) clust<-subset(clust,V1 %in% c('C','H')) #Remove singleton clusters clust$size<-sapply(clust$cluster,function(x){sum(clust$cluster==x)}) clust<-subset(clust,size>1) #Get the contig that each orf is on clust$contig<-sapply(clust$query,function(x){paste(strsplit(x,'_')[[1]][1:2],collapse='_')}) #I want to eliminate clusters that are only on contigs that ONLY HAVE THAT CLUSTER clust$cooccurringClusts<-sapply(clust$cluster,function(x){ t.contigs<-unique(clust$contig[clust$cluster==x]) return(length(unique(clust$cluster[clust$contig %in% t.contigs]))-1) }) #Use the names provided by c.names if(length(c.names)>1){ print("Appending provided names for clusters") clust$cluster<-sapply(clust$cluster,function(x){ if(x %in% names(c.names)){ return(paste(c.names[x],x,sep="")) } else { return(x) } }) } #It seems like the number of cooccurring clusters is a great way to filter the data into only showing the big clusters #HOWEVER I don't end up seeing a graph with only five connections between nodes, possibly because I end up taking a threshold. #THEREFORE I will eliminiate this step to avoid biasing myself against strong, exclusive interactions #clust<-subset(clust,cooccurringClusts>=5) #print.clust.groups(clust,k=k) print("Printing all groups") #I think that the min.overlap (below) should be the same as the cooccurringClusts (above) print.all.clusts(clust,dotfo='overlap5thresh2.dot',dot.thresh=2) print.all.clusts(clust,dotfo='overlap5thresh3.dot',dot.thresh=3) print.all.clusts(clust,dotfo='overlap5thresh4.dot',dot.thresh=4) print.all.clusts(clust,dotfo='overlap5thresh5.dot',dot.thresh=5) } print.all.clusts<-function(clust,min.overlap=1,dotfo='None',dot.thresh=2){ #Make matrix of occurance on the same contig #clust.conts will just contain which contigs match to what clusters clust.conts<-lapply(unique(clust$cluster),function(x){return(unique(clust$contig[clust$cluster==x]))}) clusts<-names(clust.conts)<-unique(clust$cluster) if(dotfo != 'None'){ print.dot(clust.conts,dotfo,thresh=dot.thresh) } else { clust.mat<-do.call(rbind,mclapply(clusts,function(p){ sapply(clusts,function(q){ #This function is ~50X faster than using the xtabs method (previously) length(intersect(clust.conts[[p]],clust.conts[[q]])) }) },mc.cores=10)) rownames(clust.mat)<-colnames(clust.mat)<-clusts clust.mat<-filter.mat(clust.mat,min.overlap) print(dim(clust.mat)) if(is.matrix(clust.mat)){ heatmap.2(clust.mat,density.info='none',trace='none',col=colorpanel(1000,'white','black')) } else {print("Did not pass filter")} } } print.dot<-function(clust.conts,dotfo,thresh=0){ #This will make a simple DOT format file that can be made into a network diagram by graphviz (externally to this script) breaks<-write.key(thresh,dotfo) for(q in c(1:(length(clust.conts)-1))){ for(p in c((q+1):length(clust.conts))){ #The COLOR of the line will be determined by the NUMBER OF CONTIGS that have BOTH CLUSTERS len<-length(intersect(clust.conts[[p]],clust.conts[[q]])) if(len>thresh){ col<-'blue' if(len>=breaks[1]){col<-'green'} if(len>=breaks[2]){col<-'orange'} if(len>=breaks[3]){col<-'red'} #len<-((len-thresh)/thresh)^2 len<-1 conn<-paste(names(clust.conts[p]),"--",names(clust.conts[q]),"[color =",col,", weight =",len,"];") write(conn,file=dotfo,append=TRUE) } } } write("}",file=dotfo,append=TRUE) } write.key<-function(thresh,dotfo){ #Make the breaks by intervals of 2, starting with the threshold. #Return the break points (ommitting the thresh) breaks<-c(thresh+2,thresh+4,thresh+6) ordinals<-c('0'='ZERO','1'='ONE','2'='TWO','3'='THREE','4'='FOUR','5'='FIVE','6'='SIX','7'='SEVEN','8'='EIGHT','9'='NINE','10'='TEN','11'='ELEVEN','12'='TWELVE','13'='THIRTEEN','14'='FOURTEEN') write("graph G {",file=dotfo) key<-paste(ordinals[as.character(thresh)],'to',ordinals[as.character(breaks[1]-1)]," [fontcolor = blue, color = blue] ",ordinals[as.character(breaks[1])],'to',ordinals[as.character(breaks[2]-1)]," [fontcolor = green, color = green] ",ordinals[as.character(breaks[2])],'to',ordinals[as.character(breaks[3]-1)]," [fontcolor = orange, color = orange] ",ordinals[as.character(breaks[3])],"andOVER [fontcolor = red, color = red]",sep="") write(key,file=dotfo,append=TRUE) return(breaks) } print.clust.groups<-function(clust,k=0){ library(clusterSim) #The input is a similarity matrix of clusters #The goal is to break it up into the optimal number of groups, and then display each of those groups individually clust.tab<-xtabs(~cluster+contig,data=clust) print(paste("There are",nrow(clust.tab),"protein families among",ncol(clust.tab),"contigs")) if(k==0){ print("Finding optimal number of groups") #k.score<-do.call(c,mclapply(c(1:(nrow(unique(clust.tab)))),function(q){ #for(q in c(1:unique.rows(clust.tab))){ k.score<-c() for(q in c(1:nrow(clust.tab))){ k<-'None' try(k<-kmeans(clust.tab,q),silent=TRUE) if(k!='None'){ #F<-CHwrapper(clust.tab,k$cluster) F<-k$tot.withinss/k$betweenss #print(paste(q,F,q*F)) k.score<-c(k.score,q*F) } } k.min<-which(k.score==min(k.score[-1]))[1] print(paste("Optimal number of protein clusters is",k.min)) } else { print(paste("Using user-specified number of clusters:",k)) k.min<-k } k<-kmeans(clust.tab,k.min,iter.max=50) for(q in unique(k$cluster)){ t.clusters<-names(k$cluster)[k$cluster==q] if(length(t.clusters)>2){ print(paste("Printing group",q,"with",length(t.clusters),"members")) t.clust<-subset(clust,clust$cluster %in% t.clusters) print.all.clusts(t.clust,dotfo=paste("clust",q,".dot",sep="")) } } } CHwrapper<-function(clust.tab,t.cluster,max.iter=100){ #index.G1 does not always work, and I'm not sure why. Keep on trying until it works q<-1 while(q<=max.iter){ out<-'None' try(out<-index.G1(clust.tab,t.cluster),silent=TRUE) if(is.numeric(out)){return(out)} q<-q+1 } return('None') } unique.rows<-function(big.tab){ return(length(unique(do.call(c,mclapply(big.tab,1,function(x){paste(x,collapse="")},mc.cores=10))))) } filter.mat<-function(tab,thr){ rows<-apply(tab,1,function(x){sum(x>0)}) cols<-apply(tab,2,function(x){sum(x>0)}) return(tab[rows>=thr,cols>=thr]) }