From f73d612815890a8d5540bd2dd3c7a58a12782a46 Mon Sep 17 00:00:00 2001
From: Samuel Mondy <samuel.mondy@inra.fr>
Date: Fri, 13 Dec 2019 10:30:07 +0100
Subject: [PATCH 1/2] Update R_amplicon_analysis

---
 R_amplicon_analysis | 280 ++++++++++++++++++++++----------------------
 1 file changed, 139 insertions(+), 141 deletions(-)

diff --git a/R_amplicon_analysis b/R_amplicon_analysis
index 04f8cce..f6183f1 100644
--- a/R_amplicon_analysis
+++ b/R_amplicon_analysis
@@ -1,4 +1,3 @@
-
 library("stats")
 library("graphics")
 library("grDevices utils")
@@ -27,9 +26,7 @@ library("caTools")
 library("tcltk")
 library("vegan")
 library("ggplot2")
-
-
-
+library("plyr")
 
 
 Ten_max<-function(ref,index=20)
@@ -90,49 +87,49 @@ add.alpha <- function(col, alpha=0.3){
 
 ###chi square test with only 2 variables
 
-chi_sam<-function(b)
-#########################################
+chi_sam<-function(b,seuil=0.05)
+  #########################################
 #########################################
 {
-t<-apply(b,1,sum)
-selec<-which(t>0)
-b<-b[selec,]
-t<-apply(b,2,sum)
-selec<-which(t>0)
-b<-b[,selec]
-y<-chisq.test(b)
-print(y)
-print(y$expected)
-print(y$observed)
-print (y$p.value)
-if(y$p.value<0.05)
-{
-if(dim(b)[2]>2)
-{
-z<-combn(dim(b)[2],2)
-for(i in 1:dim(z)[2])
-{
-r<-b[,c(z[1,i],z[2,i])]
-s<-apply(r,1,sum)
-selec<-which(s>0)
-r<-r[selec,]
-x<-chisq.test(r)
-#print(paste(colnames(b)[z[1,i]],colnames(b)[z[2,i]],sep=" vs "))
-#print(x$expected)
-#print(x$observed)
-if(x$p.value<(0.05/dim(z)[2]))
-{
-print(paste(colnames(b)[z[1,i]],colnames(b)[z[2,i]],sep=" vs "))
-print(x$p.value)
-}
-}
-print(paste("corrected p-value ",(0.05/dim(z)[2]),sep=""))
-}
-else
-{
-print (y$p.value)
-}
-}
+  t<-apply(b,1,sum)
+  selec<-which(t>0)
+  b<-b[selec,]
+  t<-apply(b,2,sum)
+  selec<-which(t>0)
+  b<-b[,selec]
+  y<-chisq.test(b)
+  print(y)
+  print(y$expected)
+  print(y$observed)
+  print (y$p.value)
+  if(y$p.value<seuil)
+  {
+    if(dim(b)[2]>2)
+    {
+      z<-combn(dim(b)[2],2)
+      for(i in 1:dim(z)[2])
+      {
+        r<-b[,c(z[1,i],z[2,i])]
+        s<-apply(r,1,sum)
+        selec<-which(s>0)
+        r<-r[selec,]
+        x<-chisq.test(r)
+        #print(paste(colnames(b)[z[1,i]],colnames(b)[z[2,i]],sep=" vs "))
+        #print(x$expected)
+        #print(x$observed)
+        if(x$p.value<(seuil/dim(z)[2]))
+        {
+          print(paste(colnames(b)[z[1,i]],colnames(b)[z[2,i]],sep=" vs "))
+          print(x$p.value)
+        }
+      }
+      print(paste("corrected p-value ",(seuil/dim(z)[2]),sep=""))
+    }
+    else
+    {
+      print (y$p.value)
+    }
+  }
 }
 
 
@@ -149,22 +146,25 @@ print (y$p.value)
 #threshold is the threshold defined above in the Ten_max() function.
 
 
-ensemble<-function(a,b,name,threshold)
+ensemble<-function(a,b,name,threshold,seuil=0.05)
 {
   sum_a<-apply(a,2,sum)
   if(length(which(sum_a==0))>0)
   {
-  a<-a[,-which(sum_a==0)] 
+    a<-a[,-which(sum_a==0)] 
   }
   #heatmap
   if(dim(a)[2]<threshold)
   {
     aprime<-t(a)
-  } else {
+  }
+  else
+  {
     aprime<-t(Ten_max(t(a),threshold))
   }
+  
   write.table(aprime, paste(Sys.Date(),name,"Ten_max",threshold,".txt",sep="_"), row.names=T, sep="\t",quote=F)
-
+  
   logaprime <- log2(aprime)
   logaprime[logaprime=="-Inf"] <- c(0)
   bprime <- data.frame(unclass(b))
@@ -205,12 +205,7 @@ ensemble<-function(a,b,name,threshold)
   
   #dev.off()
   
-  if(dim(a)[2]<threshold)
-  {
-    aprime<-t(Ten_max(t(a),dim(a)[2]))
-  } else {
-    aprime<-t(Ten_max(t(a),threshold))
-  }
+
   v<-apply(aprime,1,sum)
   w<-apply(a,1,sum)
   pourseq<-mean(v/w)*100
@@ -221,36 +216,44 @@ ensemble<-function(a,b,name,threshold)
   k<- dim(b)[2]
   
   ########################################
-  #en foncion de 1 parametre
+  #en fonction de 1 parametre
   ########################################
   
-  for(x in 2:k)
+  for(x in 1:k)
   {
     z=1
     y=1
     pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[x],y,".pdf",sep="_"),width=15,height=15)
-    par(mfrow=c(3,2))
+    par(mfrow=c(3,2), mar=c(7,4,6,4)+2)
     for(m in 1:dim(aprime)[2])
     {
-      test <- kruskal.test(aprime[,m]~bprime[,x])
-      if(test$p.value<0.05)
-      {
-      if(z%%6==0)
-      {
-        boxplot(aprime[,m]~bprime[,x],main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3)  
-        dev.off()
-        y=y+1
-        z=1
-        pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[x],y,".pdf",sep="_"),width=30,height=15)
-        par(mfrow=c(3,2))
-      }
+      condition <- length(which(!is.na(unique(bprime[,x]))==1))
+      if(condition[1]>1)
+        {
+        test <- kruskal.test(aprime[,m]~bprime[,x])
+        if(test$p.value<seuil)
+          {
+          if(z%%6==0)
+            {
+            boxplot(aprime[,m]~bprime[,x],main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3,las=2)  
+            dev.off()
+            y=y+1
+            z=1
+            pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[x],y,".pdf",sep="_"),width=30,height=15)
+            par(mfrow=c(3,2), mar=c(7,4,6,4)+2)
+            }
+          else
+            {
+            boxplot(aprime[,m]~bprime[,x],main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3,las=2)  
+            z <- z+1
+            }
+          }
+        }
       else
-      {
-        boxplot(aprime[,m]~bprime[,x],main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3)  
-        z <- z+1
-      }
+        {
+        print("kruskal not allowed")
+        }
       }
-    }
     dev.off()
     
     
@@ -269,16 +272,16 @@ ensemble<-function(a,b,name,threshold)
     d <- c()
     for(i in 1:dim(c)[1]){d[i] <- paste(c[i,c(1)],collapse="_")}
     rownames(temp) <- d
-    chi_sam(t(temp))
-    
-    
+    if(dim(t(temp))[2]>1)
+    {
+    chi_sam(t(temp),seuil)
     pdf(file=paste("BGA",Sys.Date(),name,colnames(bprime)[x],".pdf",sep="_"),width=15,height=15)
     {
       bga1 <- bca(x = datas, fac = as.factor(bprime[,x]), scannf = FALSE, nf = 5)
       
       s.class(datas$li,
               fac = as.factor(bprime[,x]),  # colorer par groupes
-              col = c("black","blue","red","green","orange","yellow","grey","darkblue"))
+              col = c("black","blue","red","darkgreen","orange","purple","grey","darkblue"))
     }
     dev.off()
     
@@ -316,7 +319,7 @@ ensemble<-function(a,b,name,threshold)
     
     print(graph)
     dev.off()
-    
+    }
   }
   
   
@@ -324,46 +327,43 @@ ensemble<-function(a,b,name,threshold)
   #########################################################
   #en foncion de 2 parametres
   #########################################################
-  comb <- combn(2:k,2)
+  comb <- combn(1:k,2)
   for(x in 1:dim(comb)[2])
   {
     temp1 <- comb[1,x]
     temp2 <- comb[2,x]
+    print(paste(colnames(b)[temp1],colnames(b)[temp2],sep="_"))
     {
-      if(dim(a)[2]<threshold)
-      {
-        aprime<-t(Ten_max(t(a),dim(a)[2]))
-      }
-      else
-      {
-        aprime<-t(Ten_max(t(a),threshold))
-      }
       grouping <- as.factor(paste(b[,temp1],b[,temp2], sep="_"))
       z=1
       y=1
       pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[temp1],colnames(b)[temp2],y,".pdf",sep="_"),width=15,height=15)
-      par(mfrow=c(3,2))
+      par(mfrow=c(3,2), mar=c(7,4,6,4)+2)
       for(m in 1:dim(aprime)[2])
       {
-      if(test$p.value<0.05)
-       {
-      
-        test <- kruskal.test(aprime[,m]~grouping)
-        if(z%%6==0)
+        condition <- length(which(!is.na(unique(grouping))==1))
+        if(condition[1]>1)
         {
-          boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3)  
-          dev.off()
-          y=y+1
-          z=1
-          pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[temp1],colnames(b)[temp2],y,".pdf",sep="_"),width=30,height=15)
-          par(mfrow=c(3,2))
-        }
-        else
+        test<-kruskal.test(aprime[,m]~grouping)
+        if(test$p.value<seuil)
         {
-          boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3)  
-          z <- z+1
+          
+          if(z%%6==0)
+          {
+            boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3,las=2)  
+            dev.off()
+            y=y+1
+            z=1
+            pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[temp1],colnames(b)[temp2],y,".pdf",sep="_"),width=30,height=15)
+            par(mfrow=c(3,2), mar=c(7,4,6,4)+2)
+          }
+          else
+          {
+            boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3,las=2)  
+            z <- z+1
+          }
+        }
         }
-	}
       }
       dev.off()
       
@@ -373,8 +373,8 @@ ensemble<-function(a,b,name,threshold)
         
         s.class(datas$li,
                 fac = grouping,  
-                col = c("black","blue","red","green","orange","yellow","grey","darkblue"))
-      }
+                col = c("black","blue","red","darkgreen","orange","purple","grey","darkblue"))
+                }
       dev.off()
       
       c<-aggregate(a, list(a=b[,temp1],b=b[,temp2]), FUN=sum)
@@ -384,9 +384,9 @@ ensemble<-function(a,b,name,threshold)
       d <- c()
       for(i in 1:dim(temp3)[1]){d[i] <- paste(temp3[i,c(1:2)],collapse="_")}
       rownames(c) <- d
-      chi_sam(t(c))
-      
-      
+      if(dim(t(c))[2]>1)
+      {
+        chi_sam(t(c),seuil)
       #NMDS
       
       
@@ -419,51 +419,49 @@ ensemble<-function(a,b,name,threshold)
       
     }
     dev.off()
+    }
   }
   
   
   
-  
   #########################################################
   #en foncion de 3 parametres
   #########################################################
-  comb <- combn(2:k,3)
+  comb <- combn(1:k,3)
   for(x in 1:dim(comb)[2])
   {
     temp1 <- comb[1,x]
     temp2 <- comb[2,x]
     temp3 <- comb[3,x]
-    if(dim(a)[2]<20)
-    {
-      aprime<-t(Ten_max(t(a),dim(a)[2]))
-    } else {
-      aprime<-t(Ten_max(t(a),threshold))
-    }
     grouping <- as.factor(paste(b[,temp1],b[,temp2],b[,temp3], sep="_"))
     
     z=1
     y=1
     pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[temp1],colnames(b)[temp2],colnames(b)[temp3],y,".pdf",sep="_"),width=15,height=15)
-    par(mfrow=c(3,2))
+    par(mfrow=c(3,2), mar=c(7,4,6,4)+2)
     for(m in 1:dim(aprime)[2])
     {
-      test <- kruskal.test(aprime[,m]~grouping)
-      if(test$p.value<0.05)
+      condition <- length(which(!is.na(unique(bprime[,x]))==1))
+      if(condition[1]>1)
+      {
+        test <- kruskal.test(aprime[,m]~grouping)
+      if(test$p.value<seuil)
       {
-      	if(z%%6==0)
-      	{
-        	boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3)  
-        	dev.off()
-        	y=y+1
-        	z=1
-        	pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[temp1],colnames(b)[temp2],colnames(b)[temp3],y,".pdf",sep="_"),width=30,height=15)
-        	par(mfrow=c(3,2))
-      	}
-      	else
-      	{
-        	boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.4)  
-        	z <- z+1
-      	}
+        if(z%%6==0)
+        {
+          boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.3,las=2)  
+          dev.off()
+          y=y+1
+          z=1
+          pdf(file=paste("boxplot",Sys.Date(),name,colnames(b)[temp1],colnames(b)[temp2],colnames(b)[temp3],y,".pdf",sep="_"),width=30,height=15)
+          par(mfrow=c(3,2), mar=c(7,4,6,4)+2)
+        }
+        else
+        {
+          boxplot(aprime[,m]~grouping,main=paste(colnames(aprime)[m],"_kruskall_test=",formatC(test$p.value, format = "e", digits = 2),sep=""), cex.names=0.4, las=2)  
+          z <- z+1
+        }
+      }
       }
     }
     dev.off()
@@ -474,7 +472,7 @@ ensemble<-function(a,b,name,threshold)
       
       s.class(datas$li,
               fac = as.factor(grouping),  
-              col = c("black","blue","red","green","orange","yellow","grey","darkblue"))
+              col = c("black","blue","red","darkgreen","orange","purple","grey","darkblue"))
     }
     dev.off()
     
@@ -485,9 +483,9 @@ ensemble<-function(a,b,name,threshold)
     d <- c()
     for(i in 1:dim(temp4)[1]){d[i] <- paste(temp4[i,c(1:3)],collapse="_")}
     rownames(c) <- d
-    chi_sam(t(c))
-    
-    
+    if(dim(t(c))[2]>1)
+    {
+      chi_sam(t(c),seuil)
     #NMDS
     
     
@@ -519,6 +517,6 @@ ensemble<-function(a,b,name,threshold)
     print(graph)
     
     dev.off()
+    }
   }
-  
 }
-- 
GitLab


From 46bbbd2e001b8d713b0ef8a5fd0f2b939e33218e Mon Sep 17 00:00:00 2001
From: Samuel Mondy <samuel.mondy@inra.fr>
Date: Fri, 13 Dec 2019 12:44:38 +0100
Subject: [PATCH 2/2] Update R_amplicon_analysis

-- 
GitLab