From f73d612815890a8d5540bd2dd3c7a58a12782a46 Mon Sep 17 00:00:00 2001 From: Samuel Mondy <samuel.mondy@inra.fr> Date: Fri, 13 Dec 2019 10:30:07 +0100 Subject: [PATCH 1/2] Update R_amplicon_analysis --- R_amplicon_analysis | 280 ++++++++++++++++++++++---------------------- 1 file changed, 139 insertions(+), 141 deletions(-) diff --git a/R_amplicon_analysis b/R_amplicon_analysis index 04f8cce..f6183f1 100644 --- a/R_amplicon_analysis +++ b/R_amplicon_analysis @@ -1,4 +1,3 @@ - library("stats") library("graphics") library("grDevices utils") @@ -27,9 +26,7 @@ library("caTools") library("tcltk") library("vegan") library("ggplot2") - - - +library("plyr") Ten_max<-function(ref,index=20) @@ -90,49 +87,49 @@ add.alpha <- function(col, alpha=0.3){ ###chi square test with only 2 variables -chi_sam<-function(b) -######################################### +chi_sam<-function(b,seuil=0.05) + ######################################### ######################################### { -t<-apply(b,1,sum) -selec<-which(t>0) -b<-b[selec,] -t<-apply(b,2,sum) -selec<-which(t>0) -b<-b[,selec] -y<-chisq.test(b) -print(y) -print(y$expected) -print(y$observed) -print (y$p.value) -if(y$p.value<0.05) -{ -if(dim(b)[2]>2) -{ -z<-combn(dim(b)[2],2) -for(i in 1:dim(z)[2]) -{ -r<-b[,c(z[1,i],z[2,i])] -s<-apply(r,1,sum) -selec<-which(s>0) -r<-r[selec,] -x<-chisq.test(r) -#print(paste(colnames(b)[z[1,i]],colnames(b)[z[2,i]],sep=" vs ")) -#print(x$expected) -#print(x$observed) -if(x$p.value<(0.05/dim(z)[2])) -{ -print(paste(colnames(b)[z[1,i]],colnames(b)[z[2,i]],sep=" vs ")) -print(x$p.value) -} -} -print(paste("corrected p-value ",(0.05/dim(z)[2]),sep="")) -} -else -{ -print (y$p.value) -} -} + t<-apply(b,1,sum) + selec<-which(t>0) + b<-b[selec,] + t<-apply(b,2,sum) + selec<-which(t>0) + b<-b[,selec] + y<-chisq.test(b) + print(y) + print(y$expected) + print(y$observed) + print (y$p.value) + if(y$p.value<seuil) + { + if(dim(b)[2]>2) + { + z<-combn(dim(b)[2],2) + for(i in 1:dim(z)[2]) + { + r<-b[,c(z[1,i],z[2,i])] + s<-apply(r,1,sum) + selec<-which(s>0) + r<-r[selec,] + x<-chisq.test(r) + #print(paste(colnames(b)[z[1,i]],colnames(b)[z[2,i]],sep=" vs ")) + #print(x$expected) + #print(x$observed) + if(x$p.value<(seuil/dim(z)[2])) + { + print(paste(colnames(b)[z[1,i]],colnames(b)[z[2,i]],sep=" vs ")) + print(x$p.value) + } + } + print(paste("corrected p-value ",(seuil/dim(z)[2]),sep="")) + } + else + { + print (y$p.value) + } + } } @@ -149,22 +146,25 @@ print (y$p.value) #threshold is the threshold defined above in the Ten_max() function. -ensemble<-function(a,b,name,threshold) +ensemble<-function(a,b,name,threshold,seuil=0.05) { sum_a<-apply(a,2,sum) if(length(which(sum_a==0))>0) { - a<-a[,-which(sum_a==0)] + a<-a[,-which(sum_a==0)] } #heatmap if(dim(a)[2]<threshold) { aprime<-t(a) - } else { + } + else + { aprime<-t(Ten_max(t(a),threshold)) } + write.table(aprime, paste(Sys.Date(),name,"Ten_max",threshold,".txt",sep="_"), row.names=T, sep="\t",quote=F) - + logaprime <- log2(aprime) logaprime[logaprime=="-Inf"] <- c(0) bprime <- data.frame(unclass(b)) @@ -205,12 +205,7 @@ ensemble<-function(a,b,name,threshold) #dev.off() - if(dim(a)[2]<threshold) - { - aprime<-t(Ten_max(t(a),dim(a)[2])) - } else { - aprime<-t(Ten_max(t(a),threshold)) - } + v<-apply(aprime,1,sum) w<-apply(a,1,sum) pourseq<-mean(v/w)*100 @@ -221,36 +216,44 @@ ensemble<-function(a,b,name,threshold) k<- dim(b)[2] ######################################## - #en foncion de 1 parametre + #en fonction de 1 parametre ######################################## - for(x in 2:k) + for(x in 1:k) { z=1 y=1 pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[x],y,".pdf",sep="_"),width=15,height=15) - par(mfrow=c(3,2)) + par(mfrow=c(3,2), mar=c(7,4,6,4)+2) for(m in 1:dim(aprime)[2]) { - test <- kruskal.test(aprime[,m]~bprime[,x]) - if(test$p.value<0.05) - { - if(z%%6==0) - { - boxplot(aprime[,m]~bprime[,x],main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3) - dev.off() - y=y+1 - z=1 - pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[x],y,".pdf",sep="_"),width=30,height=15) - par(mfrow=c(3,2)) - } + condition <- length(which(!is.na(unique(bprime[,x]))==1)) + if(condition[1]>1) + { + test <- kruskal.test(aprime[,m]~bprime[,x]) + if(test$p.value<seuil) + { + if(z%%6==0) + { + boxplot(aprime[,m]~bprime[,x],main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3,las=2) + dev.off() + y=y+1 + z=1 + pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[x],y,".pdf",sep="_"),width=30,height=15) + par(mfrow=c(3,2), mar=c(7,4,6,4)+2) + } + else + { + boxplot(aprime[,m]~bprime[,x],main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3,las=2) + z <- z+1 + } + } + } else - { - boxplot(aprime[,m]~bprime[,x],main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3) - z <- z+1 - } + { + print("kruskal not allowed") + } } - } dev.off() @@ -269,16 +272,16 @@ ensemble<-function(a,b,name,threshold) d <- c() for(i in 1:dim(c)[1]){d[i] <- paste(c[i,c(1)],collapse="_")} rownames(temp) <- d - chi_sam(t(temp)) - - + if(dim(t(temp))[2]>1) + { + chi_sam(t(temp),seuil) pdf(file=paste("BGA",Sys.Date(),name,colnames(bprime)[x],".pdf",sep="_"),width=15,height=15) { bga1 <- bca(x = datas, fac = as.factor(bprime[,x]), scannf = FALSE, nf = 5) s.class(datas$li, fac = as.factor(bprime[,x]), # colorer par groupes - col = c("black","blue","red","green","orange","yellow","grey","darkblue")) + col = c("black","blue","red","darkgreen","orange","purple","grey","darkblue")) } dev.off() @@ -316,7 +319,7 @@ ensemble<-function(a,b,name,threshold) print(graph) dev.off() - + } } @@ -324,46 +327,43 @@ ensemble<-function(a,b,name,threshold) ######################################################### #en foncion de 2 parametres ######################################################### - comb <- combn(2:k,2) + comb <- combn(1:k,2) for(x in 1:dim(comb)[2]) { temp1 <- comb[1,x] temp2 <- comb[2,x] + print(paste(colnames(b)[temp1],colnames(b)[temp2],sep="_")) { - if(dim(a)[2]<threshold) - { - aprime<-t(Ten_max(t(a),dim(a)[2])) - } - else - { - aprime<-t(Ten_max(t(a),threshold)) - } grouping <- as.factor(paste(b[,temp1],b[,temp2], sep="_")) z=1 y=1 pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[temp1],colnames(b)[temp2],y,".pdf",sep="_"),width=15,height=15) - par(mfrow=c(3,2)) + par(mfrow=c(3,2), mar=c(7,4,6,4)+2) for(m in 1:dim(aprime)[2]) { - if(test$p.value<0.05) - { - - test <- kruskal.test(aprime[,m]~grouping) - if(z%%6==0) + condition <- length(which(!is.na(unique(grouping))==1)) + if(condition[1]>1) { - boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3) - dev.off() - y=y+1 - z=1 - pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[temp1],colnames(b)[temp2],y,".pdf",sep="_"),width=30,height=15) - par(mfrow=c(3,2)) - } - else + test<-kruskal.test(aprime[,m]~grouping) + if(test$p.value<seuil) { - boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3) - z <- z+1 + + if(z%%6==0) + { + boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3,las=2) + dev.off() + y=y+1 + z=1 + pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[temp1],colnames(b)[temp2],y,".pdf",sep="_"),width=30,height=15) + par(mfrow=c(3,2), mar=c(7,4,6,4)+2) + } + else + { + boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3,las=2) + z <- z+1 + } + } } - } } dev.off() @@ -373,8 +373,8 @@ ensemble<-function(a,b,name,threshold) s.class(datas$li, fac = grouping, - col = c("black","blue","red","green","orange","yellow","grey","darkblue")) - } + col = c("black","blue","red","darkgreen","orange","purple","grey","darkblue")) + } dev.off() c<-aggregate(a, list(a=b[,temp1],b=b[,temp2]), FUN=sum) @@ -384,9 +384,9 @@ ensemble<-function(a,b,name,threshold) d <- c() for(i in 1:dim(temp3)[1]){d[i] <- paste(temp3[i,c(1:2)],collapse="_")} rownames(c) <- d - chi_sam(t(c)) - - + if(dim(t(c))[2]>1) + { + chi_sam(t(c),seuil) #NMDS @@ -419,51 +419,49 @@ ensemble<-function(a,b,name,threshold) } dev.off() + } } - ######################################################### #en foncion de 3 parametres ######################################################### - comb <- combn(2:k,3) + comb <- combn(1:k,3) for(x in 1:dim(comb)[2]) { temp1 <- comb[1,x] temp2 <- comb[2,x] temp3 <- comb[3,x] - if(dim(a)[2]<20) - { - aprime<-t(Ten_max(t(a),dim(a)[2])) - } else { - aprime<-t(Ten_max(t(a),threshold)) - } grouping <- as.factor(paste(b[,temp1],b[,temp2],b[,temp3], sep="_")) z=1 y=1 pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[temp1],colnames(b)[temp2],colnames(b)[temp3],y,".pdf",sep="_"),width=15,height=15) - par(mfrow=c(3,2)) + par(mfrow=c(3,2), mar=c(7,4,6,4)+2) for(m in 1:dim(aprime)[2]) { - test <- kruskal.test(aprime[,m]~grouping) - if(test$p.value<0.05) + condition <- length(which(!is.na(unique(bprime[,x]))==1)) + if(condition[1]>1) + { + test <- kruskal.test(aprime[,m]~grouping) + if(test$p.value<seuil) { - if(z%%6==0) - { - boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3) - dev.off() - y=y+1 - z=1 - pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[temp1],colnames(b)[temp2],colnames(b)[temp3],y,".pdf",sep="_"),width=30,height=15) - par(mfrow=c(3,2)) - } - else - { - boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.4) - z <- z+1 - } + if(z%%6==0) + { + boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3,las=2) + dev.off() + y=y+1 + z=1 + pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[temp1],colnames(b)[temp2],colnames(b)[temp3],y,".pdf",sep="_"),width=30,height=15) + par(mfrow=c(3,2), mar=c(7,4,6,4)+2) + } + else + { + boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.4, las=2) + z <- z+1 + } + } } } dev.off() @@ -474,7 +472,7 @@ ensemble<-function(a,b,name,threshold) s.class(datas$li, fac = as.factor(grouping), - col = c("black","blue","red","green","orange","yellow","grey","darkblue")) + col = c("black","blue","red","darkgreen","orange","purple","grey","darkblue")) } dev.off() @@ -485,9 +483,9 @@ ensemble<-function(a,b,name,threshold) d <- c() for(i in 1:dim(temp4)[1]){d[i] <- paste(temp4[i,c(1:3)],collapse="_")} rownames(c) <- d - chi_sam(t(c)) - - + if(dim(t(c))[2]>1) + { + chi_sam(t(c),seuil) #NMDS @@ -519,6 +517,6 @@ ensemble<-function(a,b,name,threshold) print(graph) dev.off() + } } - } -- GitLab From 46bbbd2e001b8d713b0ef8a5fd0f2b939e33218e Mon Sep 17 00:00:00 2001 From: Samuel Mondy <samuel.mondy@inra.fr> Date: Fri, 13 Dec 2019 12:44:38 +0100 Subject: [PATCH 2/2] Update R_amplicon_analysis -- GitLab