library(multicore) library(ggplot2) #This will be executed in a folder and execute an analysis of protein networks read.in.orf.pos<-function(fi){ out<-read.table(file=fi,stringsAsFactors=FALSE) out<-out[,c(1,2,3,4,5)] names(out)<-c('orf','contig','start','stop','ori') out$ori<-as.numeric(sapply(out$ori,function(x){x>0})) out$obs.ori<-as.numeric(sapply(c(1:nrow(out)),function(x){out$start[x]=min.size] if(length(t.modules)y){ x.orfs<-subset(orf.pos,orf %in% orf.clusters[[x]]) y.orfs<-subset(orf.pos,orf %in% orf.clusters[[y]]) shared.contigs<-intersect(x.orfs$contig,y.orfs$contig) if(length(shared.contigs)>0){ t.orien<-c() for(t.contig in shared.contigs){ x.row<-which(x.orfs$contig==t.contig)[1] x.start<-x.orfs$start[x.row] x.stop<-x.orfs$stop[x.row]-x.start y.row<-which(y.orfs$contig==t.contig)[1] y.start<-y.orfs$start[y.row]-x.start y.stop<-y.orfs$stop[y.row]-x.start x.start<-0 if(x.stop<0){x.stop<-x.stop*-1; y.start<-y.start*-1; y.stop<-y.stop*-1} if(y.start>0 & y.stop>y.start){t.orien<-c(t.orien,1)} ## x-> y-> if(y.start>0 & y.stop <-y if(y.start<0 & y.stop>y.start){t.orien<-c(t.orien,3)} ## y-> x-> if(y.start<0 & y.stop x-> } out<-c(out,max(table(t.orien))/length(t.orien)) } } } } return(mean(out)) } temp.hist<-function(t.values,xlab='',ylab='',main='',log='none',col='black'){ if(max(t.values)<2){ p<-qplot(t.values,geom='histogram',xlab=xlab,ylab=ylab,main=main,log=log,binwidth=0.1) }else{ p<-qplot(t.values,geom='histogram',xlab=xlab,ylab=ylab,main=main,log=log,binwidth=1) } print(p+opts(panel.background=theme_rect(fill=NA))+theme_bw()) } output.summary.stats<-function(cluster.info,clusts.in.modules,orf.pos,orf.clusters,clusters,modules,fo.base){ clusts.mod.dict<-c() seen.clusts<-c() for(x in names(clusts.in.modules)){ for(y in clusts.in.modules[[x]]){ if(y %in% seen.clusts){print(paste("Error:",y,"in multiple modules"))} seen.clusts<-c(seen.clusts,y) clusts.mod.dict[y]<-x } } cluster.info$Module<-as.character(sapply(cluster.info$cluster,function(x){if(x %in% names(clusts.mod.dict)){return(clusts.mod.dict[x])}else{return('None')}})) cluster.info$ModuleSize<-as.numeric(sapply(cluster.info$Module,function(x){if(x %in% names(clusts.in.modules)){return(length(clusts.in.modules[[x]]))}else{return(0)}})) cluster.info<-cluster.info[order(cluster.info$size),] cluster.info<-cluster.info[order(cluster.info$Module),] cluster.info<-cluster.info[rev(order(cluster.info$ModuleSize)),] names(cluster.info)<-c('Cluster','Size','Annotation','Module','ModuleSize') write.table(as.matrix(cluster.info),file=paste(fo.base,'.cluster.info.tsv',sep=""),sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE) pdf(file=paste(fo.base,'summary.pdf',sep='.')) temp.hist(cluster.info$Size,xlab='Number of proteins in group',ylab='Number of groups',main='Proteins grouped across contigs',log='y') temp.hist(cluster.info$Size[cluster.info$Annotation!='None'],xlab='Number of proteins in group',ylab='Number of groups',main='Proteins grouped across contigs, annotated by CDD',log='y') temp.hist(cluster.info$Size[cluster.info$Annotation=='None'],xlab='Number of proteins in group',ylab='Number of groups',main='Proteins grouped across contigs, UNannotated by CDD',log='y') #Now make some linkage stats by module #Module Proteins TotalContigs MeanContigs MeanOverlap MeanOverlapScore MeanOrientation linkage.stats<-do.call(rbind,lapply(names(clusts.in.modules),function(t.module){ #for(t.module in names(clusts.in.modules)) t.proteins<-clusts.in.modules[[t.module]] tot.contigs<-modules[[t.module]] mean.contigs<-mean(c(calc.overlap.scores(clusters[t.proteins])$size,calc.overlap.scores(clusters[t.proteins])$min.size)) mean.overlap<-mean(calc.overlap.scores(clusters[t.proteins])$score*calc.overlap.scores(clusters[t.proteins])$min.size) mean.overlap.score<-mean(calc.overlap.scores(clusters[t.proteins])$score) mean.orientation<-find.mean.orientation(t.proteins,orf.clusters,orf.pos) return(data.frame(Module=t.module,Proteins=length(t.proteins),TotalContigs=length(tot.contigs),MeanContigs=mean.contigs,MeanOverlap=mean.overlap,MeanOverlapScore=mean.overlap.score,MeanOrientation=mean.orientation,stringsAsFactors=FALSE)) })) linkage.stats<-linkage.stats[rev(order(linkage.stats$Proteins)),] write.table(as.matrix(linkage.stats),file=paste(fo.base,'.linkage.tsv',sep=''),sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE) temp.hist(linkage.stats$Proteins,xlab='Number of protein groups in module',ylab='Number of modules',main='Protein clusters grouping into modules within contigs') temp.hist(linkage.stats$TotalContigs,xlab='Number of contigs in module',ylab='Number of modules',main='Contigs used to group together proteins into modules') temp.hist(linkage.stats$MeanOrientation,xlab='Mean orientation score of that module',main='Modules by orientation score') dev.off() } make.col.dict<-function(t.names){ out<-c() cols<-c('brown1','chocolate1','darkgoldenrod1','chartreuse','blueviolet','lightcoral','lightgoldenrod1','greenyellow','lightslateblue','hotpink') t.vals<-rep(cols,times=10)[1:length(t.names)] out[t.names]<-t.vals return(out) } sort.contig.pos<-function(contig.pos,t.orf.pos,t.orf.clusters){ t.clust.counts<-c() t.clust.counts[names(t.orf.clusters)]<-as.numeric(do.call(c,lapply(t.orf.clusters,length))) t.clusts<-unique(t.orf.pos$cluster)[!is.na(unique(t.orf.pos$cluster))] for(t.clust in names(t.clust.counts)[rev(order(as.numeric(t.clust.counts)))]){ contig.pos[,t.clust]<-as.numeric(sapply(contig.pos$contig,function(x){any(t.orf.pos$cluster[t.orf.pos$contig==x]==t.clust)})) contig.pos[is.na(contig.pos[,t.clust]),t.clust]<-0 contig.pos<-contig.pos[order(contig.pos[,t.clust]),] } oriented<-c() for(protein.anchor in names(t.clust.counts)[rev(order(as.numeric(t.clust.counts)))]){ anchor.orfs<-t.orf.clusters[[protein.anchor]] #What position shall we anchor this orf to? anchor.orf.pos<-t.orf.pos[which(t.orf.pos$orf %in% anchor.orfs & t.orf.pos$contig %in% oriented)[1],] if(any(is.na(anchor.orf.pos))){ anchor.orf.pos<-t.orf.pos[which(t.orf.pos$orf %in% anchor.orfs)[1],] } anchor.orf.start<-as.numeric(anchor.orf.pos$start) anchor.orf.stop<-as.numeric(anchor.orf.pos$stop) if(anchor.orf.start0){ oriented<-c(oriented,t.con) con.orf.pos<-con.orf.pos[1,] if(con.orf.pos$start0){ for(t.clust in clust.to.plot){ for(r1 in which(t.orf.pos$level==q)){ for(r2 in which(t.orf.pos$level==(q-1))){ if(!any(is.na(t.orf.pos$cluster[c(r1,r2)]))){ if(t.orf.pos$cluster[r1]==t.orf.pos$cluster[r2]){ polygon(c(t.orf.pos[r1,3:4],t.orf.pos[r2,4:3]),c(q,q,q-1,q-1),border=NA,col=col.dict[t.orf.pos$cluster[r1]]) } } } polygon(c(t.orf.pos[r1,3:4],t.orf.pos[r1,4:3]),c(q-0.1,q-0.1,q,q),border=NA,col=col.dict[t.orf.pos$cluster[r1]]) } } } } for(q in c(1:nrow(t.orf.pos))){ t.lev<-t.orf.pos[q,6] t.start<-t.orf.pos[q,3] t.stop<-t.orf.pos[q,4] if(is.na(t.orf.pos[q,5])){t.col='black'}else{ t.col='blue' #t.lab<-cluster.info$annot[which(cluster.info$cluster==t.orf.pos[q,5])[1]] #if(t.lab=='None'){t.lab<-x} #text(mean(c(t.start,t.stop)),t.lev+0.2,labels=t.lab,srt=90,pos=3,cex=nrow(t.orf.pos)^(-0.25)) } if(t.stop>t.start){ polygon(c(t.start,t.stop,t.stop-(0.2*(t.stop-t.start)),t.stop-(0.2*(t.stop-t.start)),t.start),c(t.lev,t.lev,t.lev+0.3,t.lev+0.2,t.lev+0.2),border=NA,col=t.col) }else{ polygon(c(t.start,t.stop,t.stop+(0.2*(t.start-t.stop)),t.stop+(0.2*(t.start-t.stop)),t.start),c(t.lev,t.lev,t.lev-0.3,t.lev-0.2,t.lev-0.2),border=NA,col=t.col) } } names(col.dict)<-sapply(names(col.dict),function(x){ t.lab<-cluster.info$annot[which(cluster.info$cluster==x)[1]] if(t.lab=='None'){t.lab<-x} return(t.lab)}) legend('topright',names(col.dict),fill=as.character(col.dict),cex=0.75) } dev.off() } module.wrapper<-function(len.table='None.scafTable', cluster.table='None.clstr.tsv',orf.pos='None.predict.formatted',fo.base='output'){ #Read in length table temp<-read.table(file=len.table,sep='\t',stringsAsFactors=FALSE) contig.len<-c() contig.len[temp$V1]<-temp$V3 #Read in output from uclust.sh orf.cluster.dict<-make.cluster.dict(cluster.table) #Read in output from glimmer showing the position of each orf orf.pos<-read.in.orf.pos(orf.pos) #Exclude orfs that don't have valid positions (worried about circular orf constructions, which are weeded out through read.in.orf.pos orf.cluster.dict<-orf.cluster.dict[names(orf.cluster.dict) %in% orf.pos$orf] #Let's keep track of each cluster. How many orfs in it, cdd annotation cluster.info<-data.frame(cluster=unique(as.character(orf.cluster.dict)),stringsAsFactors=FALSE) cluster.info$size<-as.numeric(sapply(cluster.info$cluster,function(x){sum(as.character(orf.cluster.dict)==x)})) cluster.info<-subset(cluster.info,size>1,stringsAsFactors=FALSE) #This version does not support CDD annotation cluster.info$annot<-'None' #The two things that I have to work with now are orf.pos (which tells me where each orf is) and cluster.info (which summarizes each cluster) #I now want to organize protein (clusters) into modules #Start with the largest protein cluster. Progressively add proteins. #Let's make a list with the names being those of each protein cluster, and the value will be the contigs that those proteins are on #Each time a protein is added to a module, we remove the protein and update the module values to reflect the contigs that it is on. clusters<-lapply(cluster.info$cluster,function(x){as.character(names(orf.cluster.dict)[as.character(orf.cluster.dict)==x])}) names(clusters)<-cluster.info$cluster orf.clusters<-clusters #I think that for the purpose of this module calculation, we can simplify 'clusters' so that it only has the contig, and not the protein names clusters<-lapply(clusters,function(x){unique(as.character(sapply(x,function(y){return(strsplit(y,'\\.')[[1]][1])})))}) #Let's cluster the proteins into modules modules<-clusters print(head(modules)) stop<-FALSE min.size<-16 mod.num<-1 clusts.in.modules<-list() #Names are modules, values are vectors of protein clusters while(stop==FALSE){ print("Calculating overlap scores") overlap.scores<-calc.overlap.scores(modules,min.size=min.size) if(nrow(overlap.scores)==0){ min.size<-min.size-1 if(min.size<4){stop<-TRUE;print("Stopping loop")}else{print(paste("New cuttoff is",min.size))} ########### } else { hits<-subset(overlap.scores,score>=0.8) if(nrow(hits)==0){ min.size<-min.size-1 if(min.size<4){stop<-TRUE;print("Stopping loop")}else{print(paste("New cuttoff is",min.size))} ########### } else { hits<-hits[rev(order(hits$size)),] for(r in c(1:nrow(hits))){ prot1<-hits$prot1[r] prot2<-hits$prot2[r] if(prot1 %in% names(modules) & prot2 %in% names(modules)){ #This would be false if the protein had been merged earlier print(paste("Merging",prot1,"and",prot2)) if(prot1 %in% names(clusts.in.modules)){ new.name<-prot1 }else{ if(prot2 %in% names(clusts.in.modules)){ new.name<-prot2 } else { new.name<-paste('Module',mod.num,sep='_') mod.num<-mod.num+1 } } #Now we know what we are calling the (possibly new) module if(prot1 %in% names(clusts.in.modules)){prot1.prots<-clusts.in.modules[[prot1]]}else{prot1.prots<-prot1} if(prot2 %in% names(clusts.in.modules)){prot2.prots<-clusts.in.modules[[prot2]]}else{prot2.prots<-prot2} clusts.in.modules[[new.name]]<-c(prot1.prots,prot2.prots) #Now clusts.in.modules is updated t.contigs<-unique(c(modules[[prot1]],modules[[prot2]])) modules<-modules[-which(names(modules)==prot1)] modules<-modules[-which(names(modules)==prot2)] modules[[new.name]]<-t.contigs } } } } } #Now I want to output some summary of what this clustering has shown me #Firstly, let's output some summary stats output.summary.stats(cluster.info,clusts.in.modules,orf.pos,orf.clusters,clusters,modules,fo.base) print.module.contigs(orf.pos,clusts.in.modules,orf.clusters,cluster.info,contig.len,fo.base) }