Skip to content
Snippets Groups Projects
Commit 536f7819 authored by your name's avatar your name
Browse files

all pairwise comparisons primates+bats

parent e4af6aa7
No related branches found
No related tags found
No related merge requests found
Showing with 614 additions and 118 deletions
......@@ -20,11 +20,15 @@
\begin{document}
\maketitle
\tableofcontents
\newpage
\section{Files manipulations}
I will compare Jeanette results to DGINN results, on the SAME alignment.
I will compare Janet Young's results to DGINN results, on the SAME alignment.
\subsection{Read Jeanette table}
\subsection{Read Janet Young's table}
<<>>=
tab<-read.delim("/home/adminmarie/Documents/CIRI_BIBS_projects/2020_05_Etienne_covid/data/COVID_PAMLresults_332hits_plusBatScreens_2020_Apr14.csv",
......@@ -74,7 +78,7 @@ tab[,1:10][(tab$Gene.name %in% unique(dginn$Gene))==F,]
# Remplacement manuel par
as.character(unique(dginn$Gene)[(unique(dginn$Gene) %in% tab$Gene.name)==F])
# dans le tableau de Jeanette
# dans le tableau de Janet
val_remp=as.character(unique(dginn$Gene)[(unique(dginn$Gene) %in% tab$Gene.name)==F])
......@@ -85,7 +89,7 @@ tab$Gene.name[158]<-val_remp
sum(unique(dginn$Gene) %in% unique(tab$Gene.name))
@
\subsubsection{new columns}
\subsubsection{New columns}
<<>>=
......@@ -121,16 +125,18 @@ tab<-merge(tab, tmp, by="Gene.name")
@
\subsubsection{Write new table}
\subsection{Write new table}
<<>>=
write.table(tab,
"COVID_PAMLresults_332hits_plusBatScreens_plusDGINN_20200506.txt",
row.names=F, quote=F, sep="\t")
@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Comparisons Primates}
\subsection{Figure}
\subsection{DGINN results on Janet Young's alignments (DGINN-Young-primate) VS Janet Young's results}
\subsubsection{Omega}
......@@ -138,7 +144,7 @@ Comparaison des Omega: colonne L "whole.gene.dN.dS.model.0" VS colonne "omega" d
<<omegaM7M8>>=
plot(tab$whole.gene.dN.dS.model.0, tab$Omega_PamlM7M8,
xlab="Omega tabJeanette", ylab="Omega DGINN")
xlab="Omega Young-primate", ylab="Omega DGINN-Young-primate")
@
Quels sont les 2 gènes qui s'écartent de la bissectrice?
<<>>=
......@@ -155,7 +161,7 @@ Cette fois, je compare la colonne R "pVal.M8vsM7", à la colonne "PValue" + lign
<<pvalM7M8>>=
plot(tab$pVal.M8vsM7, tab$PValue_PamlM7M8, pch=20,
xlab="p-value tabJeanette", ylab="p-value DGINN", main="M7vM8 Paml")
xlab="p-value Young-primate", ylab="p-value DGINN-Young-primate", main="M7vM8 Paml")
points(tab$pVal.M8vsM7[tab$pVal.M8vsM7>0.05 & tab$PValue_PamlM7M8<0.05],
tab$PValue_PamlM7M8[tab$pVal.M8vsM7>0.05 & tab$PValue_PamlM7M8<0.05],
col="red", pch=20)
......@@ -163,8 +169,8 @@ points(tab$pVal.M8vsM7[tab$pVal.M8vsM7<0.05 & tab$PValue_PamlM7M8>0.05],
tab$PValue_PamlM7M8[tab$pVal.M8vsM7<0.05 & tab$PValue_PamlM7M8>0.05],
col="green", pch=20)
legend("topleft", c("<0.05 in PamlM7M8 but >0.05 in Jeanette M8vsM7",
"<0.05 in Jeanette M8vsM7 but >0.05 in PamlM7M8"),
legend("topleft", c("<0.05 in DGINN-Young-primate PamlM7M8 but >0.05 in Young M8vsM7",
"<0.05 in Young M8vsM7 but >0.05 in DGINN-Young-primate PamlM7M8"),
pch=20, col=c("red", "green"))
@
......@@ -189,11 +195,10 @@ tab[tab$Gene.name=="CIT",1:20]
\subsubsection{Concordance est méthodes}
\subsubsection{Concordance des méthodes}
Est-ce que les gènes avec une faible p-value sont détecté par 1,2,3,4 ou 5 méthodes en général?
<<stripchart>>=
nontab<-tab[tab$pVal.M8vsM7>=0.05,c("Gene.name","PosSel_PamlM1M2", "PosSel_PamlM7M8","PosSel_BppM1M2",
"PosSel_BppM7M8", "PosSel_BUSTED")]
......@@ -209,18 +214,139 @@ oui<-apply(ouitab, 1, function(x) sum(x=="Y"))
stripchart(x=list(oui, non), method="jitter", jitter=0.2,
vertical=T, pch=20, cex=0.5,
group.names=c("Yes Jeanette", "No Jeanette"),
group.names=c("Yes Young", "No Young"),
ylab="Nb YES from dginn")
@
\subsection{Résultats Cooper-primate VS Young-primate}
\subsubsection{How many genes in the Cooper-primate columns?}
<<>>=
# Temporary table with necessary columns
tmp<-tab[,c("Gene.name", "whole.gene.dN.dS.model.0", "pVal.M8vsM7",
"cooper.primates.Gene", "cooper.primates.Average_dNdS",
"cooper.primates.M7.M8_p_value")]
dim(tmp)
# Lines with values in the cooper Gene names column
dim(tmp[tmp$cooper.primates.Gene!="",])
%\subsubsection{Comparaison des codons?}
# Line with values (no NA) in the Cooper dNdS column
sum(is.na(tmp$cooper.primates.Average_dNdS)==F)
# Line with values (no NA) in the Cooper pvalue column
sum(is.na(tmp$cooper.primates.M7.M8_p_value)==F)
@
\subsubsection{Omega}
Comparaison des Omega: colonne L "whole.gene.dN.dS.model.0" VS colonne "cooper.primates.Average\_dNdS"
<<omegaM7M8coop>>=
plot(tab$whole.gene.dN.dS.model.0, tab$cooper.primates.Average_dNdS,
xlab="Omega Young-primate", ylab="Omega Cooper-primate")
@
\subsubsection{pvalues pour M7M8}
Cette fois, je compare la colonne R "pVal.M8vsM7", à la colonne cooper.primates.M7.M8\_p\_value (p-value de l'analyse de Cooper).
<<pvalM7M8coop>>=
plot(tab$pVal.M8vsM7, tab$cooper.primates.M7.M8_p_value, pch=20,
xlab="p-value Young", ylab="p-value Cooper-primate", main="M7vM8 Paml-primate")
points(tab$pVal.M8vsM7[tab$pVal.M8vsM7>0.05 & tab$cooper.primates.M7.M8_p_value<0.05],
tab$cooper.primates.M7.M8_p_value[tab$pVal.M8vsM7>0.05 & tab$cooper.primates.M7.M8_p_value<0.05],
col="red", pch=20)
points(tab$pVal.M8vsM7[tab$pVal.M8vsM7<0.05 & tab$cooper.primates.M7.M8_p_value>0.05],
tab$cooper.primates.M7.M8_p_value[tab$pVal.M8vsM7<0.05 & tab$cooper.primates.M7.M8_p_value>0.05],
col="green", pch=20)
legend("topleft", c("<0.05 in Cooper PamlM7M8 but >0.05 in Young M8vsM7",
"<0.05 in Young M8vsM7 but >0.05 in Cooper PamlM7M8"),
pch=20, col=c("red", "green"))
@
\subsection{Résultats DGINN sur alignement de Janet-Young (DGINN-Young-primate) VS Cooper-primates}
\subsubsection{Omega}
Comparaison des Omega: colonne colonne "cooper.primates.Average\_dNdS" VS omega de DGINN.
<<omegaM7M8comp3>>=
plot(tab$Omega_PamlM7M8, tab$cooper.primates.Average_dNdS,
xlab="Omega DGINN-Young-primate", ylab="Omega Cooper-primate")
@
\subsubsection{pvalues pour M7M8}
Cette fois, je compare la colonne R "pVal.M8vsM7", à la colonne "PValue" + ligne "PamlM7M8", pour la sortie de dginn.
<<pvalM7M8comp3>>=
plot(tab$PValue_PamlM7M8, tab$cooper.primates.M7.M8_p_value, pch=20,
xlab="p-value DGINN-Young-primate", ylab="p-value Cooper-primate", main="M7vM8 Paml")
points(tab$PValue_PamlM7M8[tab$PValue_PamlM7M8>0.05 & tab$cooper.primates.M7.M8_p_value<0.05],
tab$cooper.primates.M7.M8_p_value[tab$PValue_PamlM7M8>0.05 & tab$cooper.primates.M7.M8_p_value<0.05],
col="red", pch=20)
points(tab$PValue_PamlM7M8[tab$PValue_PamlM7M8<0.05 & tab$cooper.primates.M7.M8_p_value>0.05],
tab$cooper.primates.M7.M8_p_value[tab$PValue_PamlM7M8<0.05 & tab$cooper.primates.M7.M8_p_value>0.05],
col="green", pch=20)
legend("topleft", c("<0.05 in Cooper-primate PamlM7M8 but >0.05 in DGINN-Young-primate M8vsM7",
"<0.05 in DGINN-Young-primate M8vsM7 but >0.05 in Cooper-primate PamlM7M8"),
pch=20, col=c("red", "green"))
@
\subsection{Overlap}
I will draw a venn diagramm for the positive genes in the 3 analyses.
\subsubsection{Library and subtable}
<<sub>>=
library(VennDiagram)
# keeps only genes analysed in all 3 experiments
tmp<-na.omit(tab[,c("Gene.name", "pVal.M8vsM7", "cooper.primates.M7.M8_p_value",
"PosSel_PamlM7M8", "PValue_PamlM7M8")])
dim(tmp)
@
Il reste 186 gènes
<<vennprimate>>=
area1dginn<-sum(tmp$PosSel_PamlM7M8=="Y")
area2jean<-sum(tmp$pVal.M8vsM7<0.05)
area3coop<-sum(tmp$cooper.primates.M7.M8_p_val<0.05, na.rm=T)
n12<-sum(tmp$PosSel_PamlM7M8=="Y" & tmp$pVal.M8vsM7<0.05)
n23<-sum(tmp$pVal.M8vsM7<0.05 & tmp$cooper.primates.M7.M8_p_val<0.05, na.rm=T)
n13<-sum(tmp$PosSel_PamlM7M8=="Y" & tmp$cooper.primates.M7.M8_p_val<0.05, na.rm=T)
n123<-sum(tmp$PosSel_PamlM7M8=="Y" & tmp$pVal.M8vsM7<0.05 &
tmp$cooper.primates.M7.M8_p_val<0.05, na.rm=T)
draw.triple.venn(area1dginn, area2jean, area3coop,
n12, n23, n13, n123,
category=c("DGINN-Young-primate", "Young-primate", "Cooper-primate"))
@
%\subsection{Comparaison des codons?}
%Subtable with lines with both methods showing positive selection.
<<eval=FALSE, echo=FALSE>>=
%<<selec>>=
#cas ou selection + dans les 2 cas
sel<-na.omit(tab[(tab$pVal.M8vsM7<0.05 & tab$PValue_PamlM7M8<0.05),c("Gene.name", "pVal.M8vsM7", "PValue_PamlM7M8", "whole.gene.dN.dS.model.0", "Omega_PamlM7M8", "Number.of.codons.with.BEB....0.9", "Codons.under.positive.selection..BEB..0.9...alignment.position.", "NbSites_PamlM7M8","PSS_PamlM7M8")])
......@@ -228,10 +354,10 @@ dim(sel)
head(sel)
@
<<nsites, eval=FALSE, echo=FALSE>>=
%<<nsites>>=
plot(sel$Number.of.codons.with.BEB....0.9, sel$NbSites_PamlM7M8)
# toujours plus de codon dans la version de jeanette
# toujours plus de codon dans la version de janet Young
listdginn<-sapply(sel$PSS_PamlM7M8, function(x){
tmp<-strsplit(as.character(x), split=",")[[1]]
......@@ -247,7 +373,7 @@ listjanet<-sapply(sel$Codons.under.positive.selection..BEB..0.9...alignment.posi
tmp<-strsplit(as.character(x), split=",")[[1]]
tmp2<-sapply(tmp, function(x) strsplit(as.character(x), split="_")[[1]][1])
tmp2<-unlist(tmp2)
names(tmp2)<-rep("jeanette", length(tmp2))
names(tmp2)<-rep("young", length(tmp2))
return(unlist(tmp2))
})
......@@ -259,11 +385,23 @@ listjoined<-mapply(c, listdginn, listjanet, SIMPLIFY=FALSE)
\section{Bats DGINN vs Bats Cooper}
\subsection{Read DGINN table, Bats}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Bats Comparisons}
\subsection{Add DGINN results for Bats}
Lecture du tableau.
......@@ -277,25 +415,15 @@ length(unique(tab$cooper.batsGene))
table(unique(tab$cooper.batsGene) %in% unique(dginnbats$Gene))
@
Quels sont les gènes du tableau Cooper qui ne sont pas dans la sortie de DGINN
Which genes in the Cooper table are not in the gene output?
<<>>=
unique(tab$cooper.batsGene)[unique(tab$cooper.batsGene) %in% unique(dginnbats$Gene)==F]
@
Fusionner les tableaux
Merge tables:
<<last>>=
<<bats>>=
add_col<-function(dginnfile=dginnbats, method="PamlM1M2", prefixe="bats_"){
......@@ -320,59 +448,144 @@ tab<-add_col(dginnfile=dginnbats, method="BppM1M2", prefixe="bats_")
tab<-add_col(dginnfile=dginnbats, method="BppM7M8", prefixe="bats_")
head(tab[, c("cooper.batsGene", "cooper.batsAverage_dNdS","cooper.batsM7.M8_p_value", "bats_Omega_PamlM7M8","bats_PValue_PamlM7M8")])
head(na.omit(tab[, c("cooper.batsGene", "cooper.batsAverage_dNdS","cooper.batsM7.M8_p_value", "bats_Omega_PamlM7M8","bats_PValue_PamlM7M8")]))
# Manip pour la colonne BUSTED
tmp<-dginn[dginn$Method=="BUSTED",c("Gene", "Omega", "PosSel", "PValue")]
names(tmp)<-c("cooper.batsGene", "bats_Omega_BUSTED", "bats_PosSel_BUSTED", "bats_PValue_BUSTED")
tab<-merge(tab, tmp, by="cooper.batsGene")
tab<-merge(tab, tmp, all.x=T, by="cooper.batsGene")
tmp<-dginn[dginn$Method=="MEME",c("Gene", "NbSites", "PSS")]
names(tmp)<-c("cooper.batsGene", "bats_NbSites_MEME", "bats_PSS_MEME")
tab<-merge(tab, tmp, by="cooper.batsGene")
tab<-merge(tab, tmp, all.x=T, by="cooper.batsGene")
dim(tab)
@
\subsection{Comparaisons et figures}
Faire un sous tableau.
<<fig>>=
tabbats<-na.omit(tab[, c("cooper.batsGene", "cooper.batsAverage_dNdS","cooper.batsM7.M8_p_value", "bats_Omega_PamlM7M8","bats_PValue_PamlM7M8")])
dim(tabbats)
@
\subsection{Cooper-bats results vs DGINN-bats results}
Comparaison des Omega
\subsubsection{Omega}
<<omegaM7M8bats>>=
plot(tab$cooper.batsAverage_dNdS, tab$bats_Omega_PamlM7M8,
xlab="Omega Cooper", ylab="Omega DGINN")
xlab="Omega Cooper-bats", ylab="Omega DGINN-bats")
@
pvalues pour M7M8
\subsubsection{pvalues pour M7M8}
<<pvalM7M8bats>>=
plot(tab$cooper.batsM7.M8_p_value, tab$bats_PValue_PamlM7M8, pch=20,
xlab="p-value tabJeanette", ylab="p-value DGINN", main="M7vM8 Paml")
xlab="p-value Cooper-bats", ylab="p-value DGINN-bats", main="M7vM8 Paml")
points(tab$cooper.batsM7.M8_p_value[tab$cooper.batsM7.M8_p_value>0.05 & tab$bats_PValue_PamlM7M8<0.05],
tab$bats_PValue_PamlM7M8[tab$cooper.batsM7.M8_p_value>0.05 & tab$bats_PValue_PamlM7M8<0.05],
col="red", pch=20)
points(tab$cooper.batsM7.M8_p_value[tab$cooper.batsM7.M8_p_value<0.05 & tab$bats_PValue_PamlM7M8>0.05],
points(tab$cooper.batsM7.M8_p_value[tab$cooper.batsM7.M8_p_value<0.05 & tab$bats_PValue_PamlM7M8>0.05],
tab$bats_PValue_PamlM7M8[tab$cooper.batsM7.M8_p_value<0.05 & tab$bats_PValue_PamlM7M8>0.05],
col="green", pch=20)
legend("topleft", c("<0.05 in PamlM7M8 but >0.05 in Jeanette M8vsM7",
"<0.05 in Jeanette M8vsM7 but >0.05 in PamlM7M8"),
legend("topleft", c("<0.05 in DGINN-bats but >0.05 in Cooper-bats",
"<0.05 in Cooper-bats but >0.05 in DGINN-bats"),
pch=20, col=c("red", "green"))
@
\subsection{Comparaison Cooper-Hawkins}
\subsubsection{pvalues pour M7M8}
<<pvalM7M8comp2>>=
plot(tab$cooper.batsM7.M8_p_value, tab$hawkins_Positive.Selection..M8vM8a.p.value,
pch=20, xlab="p-value Cooper-bats", ylab="p-value hawkins-bats", main="M7vM8 Paml")
points(tab$cooper.batsM7.M8_p_value[tab$cooper.batsM7.M8_p_value>0.05 &
tab$hawkins_Positive.Selection..M8vM8a.p.value<0.05],
tab$hawkins_Positive.Selection..M8vM8a.p.value[tab$cooper.batsM7.M8_p_value>0.05 &
tab$hawkins_Positive.Selection..M8vM8a.p.value<0.05],
col="red", pch=20)
points(tab$cooper.batsM7.M8_p_value[tab$cooper.batsM7.M8_p_value<0.05 &
tab$hawkins_Positive.Selection..M8vM8a.p.value>0.05],
tab$hawkins_Positive.Selection..M8vM8a.p.value[tab$cooper.batsM7.M8_p_value<0.05 &
tab$hawkins_Positive.Selection..M8vM8a.p.value>0.05],
col="green", pch=20)
legend("topleft", c("<0.05 in Hawkins but >0.05 in Cooper",
"<0.05 in Cooper but >0.05 in Hawkins"),
pch=20, col=c("red", "green"))
@
\subsection{Comparaison dginn-Hawkins}
<<pvalM7M8compautre>>=
plot(tab$hawkins_Positive.Selection..M8vM8a.p.value, tab$bats_PValue_PamlM7M8,
pch=20, xlab="p-value hawkins-bats", ylab="p-value DGINN-bats", main="M7vM8 Paml")
points(tab$hawkins_Positive.Selection..M8vM8a.p.value[tab$hawkins_Positive.Selection..M8vM8a.p.value>0.05 &
tab$bats_PValue_PamlM7M8<0.05],
tab$bats_PValue_PamlM7M8[tab$hawkins_Positive.Selection..M8vM8a.p.value>0.05 &
tab$bats_PValue_PamlM7M8<0.05], col="red", pch=20)
points(tab$hawkins_Positive.Selection..M8vM8a.p.value[tab$hawkins_Positive.Selection..M8vM8a.p.value<0.05 &
tab$bats_PValue_PamlM7M8>0.05],
tab$bats_PValue_PamlM7M8[tab$hawkins_Positive.Selection..M8vM8a.p.value<0.05 &
tab$bats_PValue_PamlM7M8>0.05], col="green", pch=20)
legend("topleft", c("<0.05 in DGINN-bats but >0.05 in Hawkins",
"<0.05 in Hawkinsbut >0.05 in DGINN-bats"),
pch=20, col=c("red", "green"))
@
Apparemment, ces comparaisons n'ont pas trop de sens.
\subsection{Diagramme de Venn}
I will draw a venn diagramm for the positive genes in the 3 analyses.
\subsubsection{subtab}
<<subbats>>=
tmp<-na.omit(tab[,c("Gene.name", "bats_PValue_PamlM7M8", "hawkins_Positive.Selection..M8vM8a.p.value", "cooper.batsM7.M8_p_value")])
dim(tmp)
@
154 genes (present in the 3 experiments)
\subsubsection{figure}
<<vennbats>>=
area1dginn<-sum(tmp$bats_PValue_PamlM7M8<0.05, na.rm=T)
area2hawk<-sum(tmp$hawkins_Positive.Selection..M8vM8a.p.value<0.05, na.rm=T)
area3coop<-sum(tmp$cooper.batsM7.M8_p_value<0.05, na.rm=T)
n12<-sum(tmp$bats_PValue_PamlM7M8<0.05 & tmp$hawkins_Positive.Selection..M8vM8a.p.value<0.05, na.rm=T)
n23<-sum(tmp$hawkins_Positive.Selection..M8vM8a.p.value<0.05 & tmp$cooper.batsM7.M8_p_value<0.05, na.rm=T)
n13<-sum(tmp$bats_PValue_PamlM7M8<0.05 & tmp$cooper.batsM7.M8_p_value<0.05, na.rm=T)
n123<-sum(tmp$bats_PValue_PamlM7M8<0.05 & tmp$hawkins_Positive.Selection..M8vM8a.p.value<0.05 & tmp$cooper.batsM7.M8_p_value<0.05, na.rm=T)
draw.triple.venn(area1dginn, area2hawk, area3coop, n12, n23, n13, n123, category=c("dginn", "hawkins", "cooper"))
@
\end{document}
......
No preview for this file type
This diff is collapsed.
File added
No preview for this file type
No preview for this file type
File added
File added
No preview for this file type
No preview for this file type
File added
File added
File added
File added
No preview for this file type
File added
File added
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment