Commit 536f7819 authored by your name's avatar your name
Browse files

all pairwise comparisons primates+bats

parent e4af6aa7
......@@ -20,11 +20,15 @@
\begin{document}
\maketitle
\tableofcontents
\newpage
\section{Files manipulations}
I will compare Jeanette results to DGINN results, on the SAME alignment.
I will compare Janet Young's results to DGINN results, on the SAME alignment.
\subsection{Read Jeanette table}
\subsection{Read Janet Young's table}
<<>>=
tab<-read.delim("/home/adminmarie/Documents/CIRI_BIBS_projects/2020_05_Etienne_covid/data/COVID_PAMLresults_332hits_plusBatScreens_2020_Apr14.csv",
......@@ -74,7 +78,7 @@ tab[,1:10][(tab$Gene.name %in% unique(dginn$Gene))==F,]
# Remplacement manuel par
as.character(unique(dginn$Gene)[(unique(dginn$Gene) %in% tab$Gene.name)==F])
# dans le tableau de Jeanette
# dans le tableau de Janet
val_remp=as.character(unique(dginn$Gene)[(unique(dginn$Gene) %in% tab$Gene.name)==F])
......@@ -85,7 +89,7 @@ tab$Gene.name[158]<-val_remp
sum(unique(dginn$Gene) %in% unique(tab$Gene.name))
@
\subsubsection{new columns}
\subsubsection{New columns}
<<>>=
......@@ -121,16 +125,18 @@ tab<-merge(tab, tmp, by="Gene.name")
@
\subsubsection{Write new table}
\subsection{Write new table}
<<>>=
write.table(tab,
"COVID_PAMLresults_332hits_plusBatScreens_plusDGINN_20200506.txt",
row.names=F, quote=F, sep="\t")
@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Comparisons Primates}
\subsection{Figure}
\subsection{DGINN results on Janet Young's alignments (DGINN-Young-primate) VS Janet Young's results}
\subsubsection{Omega}
......@@ -138,7 +144,7 @@ Comparaison des Omega: colonne L "whole.gene.dN.dS.model.0" VS colonne "omega" d
<<omegaM7M8>>=
plot(tab$whole.gene.dN.dS.model.0, tab$Omega_PamlM7M8,
xlab="Omega tabJeanette", ylab="Omega DGINN")
xlab="Omega Young-primate", ylab="Omega DGINN-Young-primate")
@
Quels sont les 2 gènes qui s'écartent de la bissectrice?
<<>>=
......@@ -155,7 +161,7 @@ Cette fois, je compare la colonne R "pVal.M8vsM7", à la colonne "PValue" + lign
<<pvalM7M8>>=
plot(tab$pVal.M8vsM7, tab$PValue_PamlM7M8, pch=20,
xlab="p-value tabJeanette", ylab="p-value DGINN", main="M7vM8 Paml")
xlab="p-value Young-primate", ylab="p-value DGINN-Young-primate", main="M7vM8 Paml")
points(tab$pVal.M8vsM7[tab$pVal.M8vsM7>0.05 & tab$PValue_PamlM7M8<0.05],
tab$PValue_PamlM7M8[tab$pVal.M8vsM7>0.05 & tab$PValue_PamlM7M8<0.05],
col="red", pch=20)
......@@ -163,8 +169,8 @@ points(tab$pVal.M8vsM7[tab$pVal.M8vsM7<0.05 & tab$PValue_PamlM7M8>0.05],
tab$PValue_PamlM7M8[tab$pVal.M8vsM7<0.05 & tab$PValue_PamlM7M8>0.05],
col="green", pch=20)
legend("topleft", c("<0.05 in PamlM7M8 but >0.05 in Jeanette M8vsM7",
"<0.05 in Jeanette M8vsM7 but >0.05 in PamlM7M8"),
legend("topleft", c("<0.05 in DGINN-Young-primate PamlM7M8 but >0.05 in Young M8vsM7",
"<0.05 in Young M8vsM7 but >0.05 in DGINN-Young-primate PamlM7M8"),
pch=20, col=c("red", "green"))
@
......@@ -189,11 +195,10 @@ tab[tab$Gene.name=="CIT",1:20]
\subsubsection{Concordance est méthodes}
\subsubsection{Concordance des méthodes}
Est-ce que les gènes avec une faible p-value sont détecté par 1,2,3,4 ou 5 méthodes en général?
<<stripchart>>=
nontab<-tab[tab$pVal.M8vsM7>=0.05,c("Gene.name","PosSel_PamlM1M2", "PosSel_PamlM7M8","PosSel_BppM1M2",
"PosSel_BppM7M8", "PosSel_BUSTED")]
......@@ -209,18 +214,139 @@ oui<-apply(ouitab, 1, function(x) sum(x=="Y"))
stripchart(x=list(oui, non), method="jitter", jitter=0.2,
vertical=T, pch=20, cex=0.5,
group.names=c("Yes Jeanette", "No Jeanette"),
group.names=c("Yes Young", "No Young"),
ylab="Nb YES from dginn")
@
\subsection{Résultats Cooper-primate VS Young-primate}
\subsubsection{How many genes in the Cooper-primate columns?}
<<>>=
# Temporary table with necessary columns
tmp<-tab[,c("Gene.name", "whole.gene.dN.dS.model.0", "pVal.M8vsM7",
"cooper.primates.Gene", "cooper.primates.Average_dNdS",
"cooper.primates.M7.M8_p_value")]
dim(tmp)
# Lines with values in the cooper Gene names column
dim(tmp[tmp$cooper.primates.Gene!="",])
%\subsubsection{Comparaison des codons?}
# Line with values (no NA) in the Cooper dNdS column
sum(is.na(tmp$cooper.primates.Average_dNdS)==F)
# Line with values (no NA) in the Cooper pvalue column
sum(is.na(tmp$cooper.primates.M7.M8_p_value)==F)
@
\subsubsection{Omega}
Comparaison des Omega: colonne L "whole.gene.dN.dS.model.0" VS colonne "cooper.primates.Average\_dNdS"
<<omegaM7M8coop>>=
plot(tab$whole.gene.dN.dS.model.0, tab$cooper.primates.Average_dNdS,
xlab="Omega Young-primate", ylab="Omega Cooper-primate")
@
\subsubsection{pvalues pour M7M8}
Cette fois, je compare la colonne R "pVal.M8vsM7", à la colonne cooper.primates.M7.M8\_p\_value (p-value de l'analyse de Cooper).
<<pvalM7M8coop>>=
plot(tab$pVal.M8vsM7, tab$cooper.primates.M7.M8_p_value, pch=20,
xlab="p-value Young", ylab="p-value Cooper-primate", main="M7vM8 Paml-primate")
points(tab$pVal.M8vsM7[tab$pVal.M8vsM7>0.05 & tab$cooper.primates.M7.M8_p_value<0.05],
tab$cooper.primates.M7.M8_p_value[tab$pVal.M8vsM7>0.05 & tab$cooper.primates.M7.M8_p_value<0.05],
col="red", pch=20)
points(tab$pVal.M8vsM7[tab$pVal.M8vsM7<0.05 & tab$cooper.primates.M7.M8_p_value>0.05],
tab$cooper.primates.M7.M8_p_value[tab$pVal.M8vsM7<0.05 & tab$cooper.primates.M7.M8_p_value>0.05],
col="green", pch=20)
legend("topleft", c("<0.05 in Cooper PamlM7M8 but >0.05 in Young M8vsM7",
"<0.05 in Young M8vsM7 but >0.05 in Cooper PamlM7M8"),
pch=20, col=c("red", "green"))
@
\subsection{Résultats DGINN sur alignement de Janet-Young (DGINN-Young-primate) VS Cooper-primates}
\subsubsection{Omega}
Comparaison des Omega: colonne colonne "cooper.primates.Average\_dNdS" VS omega de DGINN.
<<omegaM7M8comp3>>=
plot(tab$Omega_PamlM7M8, tab$cooper.primates.Average_dNdS,
xlab="Omega DGINN-Young-primate", ylab="Omega Cooper-primate")
@
\subsubsection{pvalues pour M7M8}
Cette fois, je compare la colonne R "pVal.M8vsM7", à la colonne "PValue" + ligne "PamlM7M8", pour la sortie de dginn.
<<pvalM7M8comp3>>=
plot(tab$PValue_PamlM7M8, tab$cooper.primates.M7.M8_p_value, pch=20,
xlab="p-value DGINN-Young-primate", ylab="p-value Cooper-primate", main="M7vM8 Paml")
points(tab$PValue_PamlM7M8[tab$PValue_PamlM7M8>0.05 & tab$cooper.primates.M7.M8_p_value<0.05],
tab$cooper.primates.M7.M8_p_value[tab$PValue_PamlM7M8>0.05 & tab$cooper.primates.M7.M8_p_value<0.05],
col="red", pch=20)
points(tab$PValue_PamlM7M8[tab$PValue_PamlM7M8<0.05 & tab$cooper.primates.M7.M8_p_value>0.05],
tab$cooper.primates.M7.M8_p_value[tab$PValue_PamlM7M8<0.05 & tab$cooper.primates.M7.M8_p_value>0.05],
col="green", pch=20)
legend("topleft", c("<0.05 in Cooper-primate PamlM7M8 but >0.05 in DGINN-Young-primate M8vsM7",
"<0.05 in DGINN-Young-primate M8vsM7 but >0.05 in Cooper-primate PamlM7M8"),
pch=20, col=c("red", "green"))
@
\subsection{Overlap}
I will draw a venn diagramm for the positive genes in the 3 analyses.
\subsubsection{Library and subtable}
<<sub>>=
library(VennDiagram)
# keeps only genes analysed in all 3 experiments
tmp<-na.omit(tab[,c("Gene.name", "pVal.M8vsM7", "cooper.primates.M7.M8_p_value",
"PosSel_PamlM7M8", "PValue_PamlM7M8")])
dim(tmp)
@
Il reste 186 gènes
<<vennprimate>>=
area1dginn<-sum(tmp$PosSel_PamlM7M8=="Y")
area2jean<-sum(tmp$pVal.M8vsM7<0.05)
area3coop<-sum(tmp$cooper.primates.M7.M8_p_val<0.05, na.rm=T)
n12<-sum(tmp$PosSel_PamlM7M8=="Y" & tmp$pVal.M8vsM7<0.05)
n23<-sum(tmp$pVal.M8vsM7<0.05 & tmp$cooper.primates.M7.M8_p_val<0.05, na.rm=T)
n13<-sum(tmp$PosSel_PamlM7M8=="Y" & tmp$cooper.primates.M7.M8_p_val<0.05, na.rm=T)
n123<-sum(tmp$PosSel_PamlM7M8=="Y" & tmp$pVal.M8vsM7<0.05 &
tmp$cooper.primates.M7.M8_p_val<0.05, na.rm=T)
draw.triple.venn(area1dginn, area2jean, area3coop,
n12, n23, n13, n123,
category=c("DGINN-Young-primate", "Young-primate", "Cooper-primate"))
@
%\subsection{Comparaison des codons?}
%Subtable with lines with both methods showing positive selection.
<<eval=FALSE, echo=FALSE>>=
%<<selec>>=
#cas ou selection + dans les 2 cas
sel<-na.omit(tab[(tab$pVal.M8vsM7<0.05 & tab$PValue_PamlM7M8<0.05),c("Gene.name", "pVal.M8vsM7", "PValue_PamlM7M8", "whole.gene.dN.dS.model.0", "Omega_PamlM7M8", "Number.of.codons.with.BEB....0.9", "Codons.under.positive.selection..BEB..0.9...alignment.position.", "NbSites_PamlM7M8","PSS_PamlM7M8")])
......@@ -228,10 +354,10 @@ dim(sel)
head(sel)
@
<<nsites, eval=FALSE, echo=FALSE>>=
%<<nsites>>=
plot(sel$Number.of.codons.with.BEB....0.9, sel$NbSites_PamlM7M8)
# toujours plus de codon dans la version de jeanette
# toujours plus de codon dans la version de janet Young
listdginn<-sapply(sel$PSS_PamlM7M8, function(x){
tmp<-strsplit(as.character(x), split=",")[[1]]
......@@ -247,7 +373,7 @@ listjanet<-sapply(sel$Codons.under.positive.selection..BEB..0.9...alignment.posi
tmp<-strsplit(as.character(x), split=",")[[1]]
tmp2<-sapply(tmp, function(x) strsplit(as.character(x), split="_")[[1]][1])
tmp2<-unlist(tmp2)
names(tmp2)<-rep("jeanette", length(tmp2))
names(tmp2)<-rep("young", length(tmp2))
return(unlist(tmp2))
})
......@@ -259,11 +385,23 @@ listjoined<-mapply(c, listdginn, listjanet, SIMPLIFY=FALSE)
\section{Bats DGINN vs Bats Cooper}
\subsection{Read DGINN table, Bats}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Bats Comparisons}
\subsection{Add DGINN results for Bats}
Lecture du tableau.
......@@ -277,25 +415,15 @@ length(unique(tab$cooper.batsGene))
table(unique(tab$cooper.batsGene) %in% unique(dginnbats$Gene))
@
Quels sont les gènes du tableau Cooper qui ne sont pas dans la sortie de DGINN
Which genes in the Cooper table are not in the gene output?
<<>>=
unique(tab$cooper.batsGene)[unique(tab$cooper.batsGene) %in% unique(dginnbats$Gene)==F]
@
Fusionner les tableaux
Merge tables:
<<last>>=
<<bats>>=
add_col<-function(dginnfile=dginnbats, method="PamlM1M2", prefixe="bats_"){
......@@ -320,59 +448,144 @@ tab<-add_col(dginnfile=dginnbats, method="BppM1M2", prefixe="bats_")
tab<-add_col(dginnfile=dginnbats, method="BppM7M8", prefixe="bats_")
head(tab[, c("cooper.batsGene", "cooper.batsAverage_dNdS","cooper.batsM7.M8_p_value", "bats_Omega_PamlM7M8","bats_PValue_PamlM7M8")])
head(na.omit(tab[, c("cooper.batsGene", "cooper.batsAverage_dNdS","cooper.batsM7.M8_p_value", "bats_Omega_PamlM7M8","bats_PValue_PamlM7M8")]))
# Manip pour la colonne BUSTED
tmp<-dginn[dginn$Method=="BUSTED",c("Gene", "Omega", "PosSel", "PValue")]
names(tmp)<-c("cooper.batsGene", "bats_Omega_BUSTED", "bats_PosSel_BUSTED", "bats_PValue_BUSTED")
tab<-merge(tab, tmp, by="cooper.batsGene")
tab<-merge(tab, tmp, all.x=T, by="cooper.batsGene")
tmp<-dginn[dginn$Method=="MEME",c("Gene", "NbSites", "PSS")]
names(tmp)<-c("cooper.batsGene", "bats_NbSites_MEME", "bats_PSS_MEME")
tab<-merge(tab, tmp, by="cooper.batsGene")
tab<-merge(tab, tmp, all.x=T, by="cooper.batsGene")
dim(tab)
@
\subsection{Comparaisons et figures}
Faire un sous tableau.
<<fig>>=
tabbats<-na.omit(tab[, c("cooper.batsGene", "cooper.batsAverage_dNdS","cooper.batsM7.M8_p_value", "bats_Omega_PamlM7M8","bats_PValue_PamlM7M8")])
dim(tabbats)
@
\subsection{Cooper-bats results vs DGINN-bats results}
Comparaison des Omega
\subsubsection{Omega}
<<omegaM7M8bats>>=
plot(tab$cooper.batsAverage_dNdS, tab$bats_Omega_PamlM7M8,
xlab="Omega Cooper", ylab="Omega DGINN")
xlab="Omega Cooper-bats", ylab="Omega DGINN-bats")
@
pvalues pour M7M8
\subsubsection{pvalues pour M7M8}
<<pvalM7M8bats>>=
plot(tab$cooper.batsM7.M8_p_value, tab$bats_PValue_PamlM7M8, pch=20,
xlab="p-value tabJeanette", ylab="p-value DGINN", main="M7vM8 Paml")
xlab="p-value Cooper-bats", ylab="p-value DGINN-bats", main="M7vM8 Paml")
points(tab$cooper.batsM7.M8_p_value[tab$cooper.batsM7.M8_p_value>0.05 & tab$bats_PValue_PamlM7M8<0.05],
tab$bats_PValue_PamlM7M8[tab$cooper.batsM7.M8_p_value>0.05 & tab$bats_PValue_PamlM7M8<0.05],
col="red", pch=20)
points(tab$cooper.batsM7.M8_p_value[tab$cooper.batsM7.M8_p_value<0.05 & tab$bats_PValue_PamlM7M8>0.05],
points(tab$cooper.batsM7.M8_p_value[tab$cooper.batsM7.M8_p_value<0.05 & tab$bats_PValue_PamlM7M8>0.05],
tab$bats_PValue_PamlM7M8[tab$cooper.batsM7.M8_p_value<0.05 & tab$bats_PValue_PamlM7M8>0.05],
col="green", pch=20)
legend("topleft", c("<0.05 in PamlM7M8 but >0.05 in Jeanette M8vsM7",
"<0.05 in Jeanette M8vsM7 but >0.05 in PamlM7M8"),
legend("topleft", c("<0.05 in DGINN-bats but >0.05 in Cooper-bats",
"<0.05 in Cooper-bats but >0.05 in DGINN-bats"),
pch=20, col=c("red", "green"))
@
\subsection{Comparaison Cooper-Hawkins}
\subsubsection{pvalues pour M7M8}
<<pvalM7M8comp2>>=
plot(tab$cooper.batsM7.M8_p_value, tab$hawkins_Positive.Selection..M8vM8a.p.value,
pch=20, xlab="p-value Cooper-bats", ylab="p-value hawkins-bats", main="M7vM8 Paml")
points(tab$cooper.batsM7.M8_p_value[tab$cooper.batsM7.M8_p_value>0.05 &
tab$hawkins_Positive.Selection..M8vM8a.p.value<0.05],
tab$hawkins_Positive.Selection..M8vM8a.p.value[tab$cooper.batsM7.M8_p_value>0.05 &
tab$hawkins_Positive.Selection..M8vM8a.p.value<0.05],
col="red", pch=20)
points(tab$cooper.batsM7.M8_p_value[tab$cooper.batsM7.M8_p_value<0.05 &
tab$hawkins_Positive.Selection..M8vM8a.p.value>0.05],
tab$hawkins_Positive.Selection..M8vM8a.p.value[tab$cooper.batsM7.M8_p_value<0.05 &
tab$hawkins_Positive.Selection..M8vM8a.p.value>0.05],
col="green", pch=20)
legend("topleft", c("<0.05 in Hawkins but >0.05 in Cooper",
"<0.05 in Cooper but >0.05 in Hawkins"),
pch=20, col=c("red", "green"))
@
\subsection{Comparaison dginn-Hawkins}
<<pvalM7M8compautre>>=
plot(tab$hawkins_Positive.Selection..M8vM8a.p.value, tab$bats_PValue_PamlM7M8,
pch=20, xlab="p-value hawkins-bats", ylab="p-value DGINN-bats", main="M7vM8 Paml")
points(tab$hawkins_Positive.Selection..M8vM8a.p.value[tab$hawkins_Positive.Selection..M8vM8a.p.value>0.05 &
tab$bats_PValue_PamlM7M8<0.05],
tab$bats_PValue_PamlM7M8[tab$hawkins_Positive.Selection..M8vM8a.p.value>0.05 &
tab$bats_PValue_PamlM7M8<0.05], col="red", pch=20)
points(tab$hawkins_Positive.Selection..M8vM8a.p.value[tab$hawkins_Positive.Selection..M8vM8a.p.value<0.05 &
tab$bats_PValue_PamlM7M8>0.05],
tab$bats_PValue_PamlM7M8[tab$hawkins_Positive.Selection..M8vM8a.p.value<0.05 &
tab$bats_PValue_PamlM7M8>0.05], col="green", pch=20)
legend("topleft", c("<0.05 in DGINN-bats but >0.05 in Hawkins",
"<0.05 in Hawkinsbut >0.05 in DGINN-bats"),
pch=20, col=c("red", "green"))
@
Apparemment, ces comparaisons n'ont pas trop de sens.
\subsection{Diagramme de Venn}
I will draw a venn diagramm for the positive genes in the 3 analyses.
\subsubsection{subtab}
<<subbats>>=
tmp<-na.omit(tab[,c("Gene.name", "bats_PValue_PamlM7M8", "hawkins_Positive.Selection..M8vM8a.p.value", "cooper.batsM7.M8_p_value")])
dim(tmp)
@
154 genes (present in the 3 experiments)
\subsubsection{figure}
<<vennbats>>=
area1dginn<-sum(tmp$bats_PValue_PamlM7M8<0.05, na.rm=T)
area2hawk<-sum(tmp$hawkins_Positive.Selection..M8vM8a.p.value<0.05, na.rm=T)
area3coop<-sum(tmp$cooper.batsM7.M8_p_value<0.05, na.rm=T)
n12<-sum(tmp$bats_PValue_PamlM7M8<0.05 & tmp$hawkins_Positive.Selection..M8vM8a.p.value<0.05, na.rm=T)
n23<-sum(tmp$hawkins_Positive.Selection..M8vM8a.p.value<0.05 & tmp$cooper.batsM7.M8_p_value<0.05, na.rm=T)
n13<-sum(tmp$bats_PValue_PamlM7M8<0.05 & tmp$cooper.batsM7.M8_p_value<0.05, na.rm=T)
n123<-sum(tmp$bats_PValue_PamlM7M8<0.05 & tmp$hawkins_Positive.Selection..M8vM8a.p.value<0.05 & tmp$cooper.batsM7.M8_p_value<0.05, na.rm=T)
draw.triple.venn(area1dginn, area2hawk, area3coop, n12, n23, n13, n123, category=c("dginn", "hawkins", "cooper"))
@
\end{document}
......
No preview for this file type
This diff is collapsed.
No preview for this file type
No preview for this file type
No preview for this file type
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment