Skip to content
Snippets Groups Projects
Commit f58fd485 authored by your name's avatar your name
Browse files

rapport 2: comparaison des analyses de Jeanette et DGINN sur les mêmes alignements

parents
No related branches found
No related tags found
No related merge requests found
source diff could not be displayed: it is too large. Options to address this: view the blob.
\documentclass[11pt, oneside]{article} % use "amsart" instead of "article" for AMSLaTeX format
%\usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots.
%\geometry{letterpaper} % ... or a4paper or a5paper or ...
%\geometry{landscape} % Activate for for rotated page geometry
%\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
%\usepackage{graphicx} % Use pdf, png, jpg, or eps with pdflatex; use eps in DVI mode
% TeX will automatically convert eps --> pdf in pdflatex
%\usepackage{amssymb}
\usepackage[utf8]{inputenc}
%\usepackage[cyr]{aeguill}
%\usepackage[francais]{babel}
%\usepackage{hyperref}
\title{Positive selection on genes interacting with SARS-Cov2, comparison of different analysis}
\author{Marie Cariou}
\date{Mai 2020} % Activate to display a given date or no date
\begin{document}
\maketitle
\section{Files manipulations}
I will compare Janet results to DGINN results, on the SAME alignment.
\subsection{Read Janet table}
<<>>=
tab<-read.delim("/home/adminmarie/Documents/CIRI_BIBS_projects/2020_05_Etienne_covid/data/COVID_PAMLresults_332hits_plusBatScreens_2020_Apr14.csv",
fill=T, h=T, dec=",")
dim(tab)
names(tab)
@
\subsection{Read DGINN table}
<<>>=
dginn<-read.delim("/home/adminmarie/Documents/CIRI_BIBS_projects/2020_05_Etienne_covid/data/summary.res",
fill=T, h=T)
dim(dginn)
names(dginn)
@
\subsection{Joining table}
\subsubsection{Based on which column?}
<<>>=
head(tab)[,1:5]
# gene avec un nom bizar dans certaines colomne
tab[158,1:10]
#
length(unique(dginn$Gene))
length(unique(tab$PreyGene))
length(unique(tab$Gene.name))
#quelle paire de colonne contient le plus de noms identiques
sum(unique(dginn$Gene) %in% unique(tab$PreyGene))
sum(unique(dginn$Gene) %in% unique(tab$Gene.name))
# dginn$Gene et tab$Gene.name presque identiques sauf 1 ligne. Je soupçonne que c'est celle là:
tab[158,1:10]
# Verif:
tab[,1:10][(tab$Gene.name %in% unique(dginn$Gene))==F,]
# yep
# Remplacement manuel par
as.character(unique(dginn$Gene)[(unique(dginn$Gene) %in% tab$Gene.name)==F])
# dans le tableau de Janet
val_remp=as.character(unique(dginn$Gene)[(unique(dginn$Gene) %in% tab$Gene.name)==F])
tab$Gene.name<-as.character(tab$Gene.name)
tab$Gene.name[158]<-val_remp
sum(unique(dginn$Gene) %in% unique(tab$Gene.name))
@
\subsubsection{new columns}
<<>>=
add_col<-function(method="PamlM1M2"){
tmp<-dginn[dginn$Method==method,
c("Gene", "Omega", "PosSel", "PValue", "NbSites", "PSS")]
names(tmp)<-c("Gene.name", paste0("Omega_", method),
paste0("PosSel_", method), paste0("PValue_", method), paste0("NbSites_", method), paste0("PSS_", method))
tab<-merge(tab, tmp, by="Gene.name")
return(tab)
}
tab<-add_col("PamlM1M2")
tab<-add_col("PamlM7M8")
tab<-add_col("BppM1M2")
tab<-add_col("BppM7M8")
# Manip pour la colonne BUSTED
tmp<-dginn[dginn$Method=="BUSTED",c("Gene", "Omega", "PosSel", "PValue")]
names(tmp)<-c("Gene.name", "Omega_BUSTED", "PosSel_BUSTED", "PValue_BUSTED")
tab<-merge(tab, tmp, by="Gene.name")
tmp<-dginn[dginn$Method=="MEME",c("Gene", "NbSites", "PSS")]
names(tmp)<-c("Gene.name", "NbSites_MEME", "PSS_MEME")
tab<-merge(tab, tmp, by="Gene.name")
@
\subsubsection{Write new table}
<<>>=
write.table(tab, "COVID_PAMLresults_332hits_plusBatScreens_plusDGINN_20200506.txt",
row.names=F, quote=F, sep="\t")
@
\subsection{Figure}
\subsubsection{Omega}
Comparaison des Omega: colonne L "whole.gene.dN.dS.model.0" VS colonne "omega" dans la sortie de dginn.
<<omegaM7M8>>=
plot(tab$whole.gene.dN.dS.model.0, tab$Omega_PamlM7M8,
xlab="Omega tabJanet", ylab="Omega DGINN")
@
Quels sont les 2 gènes qui s'écartent de la bissectrice?
<<>>=
tab[tab$whole.gene.dN.dS.model.0<0.2 & tab$Omega_PamlM7M8>0.4,c("Gene.name")]
tab[tab$whole.gene.dN.dS.model.0<0.6 & tab$Omega_PamlM7M8>0.7,c("Gene.name")]
@
\subsubsection{pvalues pour M7M8}
Cette fois, je compare la colonne R "pVal.M8vsM7", à la colonne "PValue" + ligne "PamlM7M8", pour la sortie de dginn.
<<pvalM7M8>>=
plot(tab$pVal.M8vsM7, tab$PValue_PamlM7M8, pch=20,
xlab="p-value tabJanet", ylab="p-value DGINN", main="M7vM8 Paml")
points(tab$pVal.M8vsM7[tab$pVal.M8vsM7>0.05 & tab$PValue_PamlM7M8<0.05],
tab$PValue_PamlM7M8[tab$pVal.M8vsM7>0.05 & tab$PValue_PamlM7M8<0.05],
col="red", pch=20)
points(tab$pVal.M8vsM7[tab$pVal.M8vsM7<0.05 & tab$PValue_PamlM7M8>0.05],
tab$PValue_PamlM7M8[tab$pVal.M8vsM7<0.05 & tab$PValue_PamlM7M8>0.05],
col="green", pch=20)
legend("topleft", c("<0.05 in PamlM7M8 but >0.05 in Janet M8vsM7","<0.05 in Janet M8vsM7 but >0.05 in PamlM7M8"),
pch=20, col=c("red", "green"))
@
Quels sont les gènes en couleur:
<<>>=
na.omit(tab[(tab$pVal.M8vsM7>0.05 & tab$PValue_PamlM7M8<0.05),c("Gene.name", "pVal.M8vsM7", "PValue_PamlM7M8", "whole.gene.dN.dS.model.0", "Omega_PamlM7M8")])
na.omit(tab[(tab$pVal.M8vsM7<0.05 & tab$PValue_PamlM7M8>0.05),c("Gene.name", "pVal.M8vsM7", "PValue_PamlM7M8", "whole.gene.dN.dS.model.0", "Omega_PamlM7M8")])
@
Focus sur le gène CIT pour lequel la différence est vraiment assez importante:
<<cit>>=
dginn[dginn$Gene=="CIT",]
tab[tab$Gene.name=="CIT",1:20]
@
\subsubsection{Concordance est méthodes}
Est-ce que les gènes avec une faible p-value sont détecté par 1,2,3,4 ou 5 méthodes en général?
<<stripchart>>=
nontab<-tab[tab$pVal.M8vsM7>=0.05,c("Gene.name","PosSel_PamlM1M2", "PosSel_PamlM7M8","PosSel_BppM1M2",
"PosSel_BppM7M8", "PosSel_BUSTED")]
non<-apply(nontab, 1, function(x) sum(x=="Y"))
ouitab<-tab[tab$pVal.M8vsM7<0.05,c("Gene.name","PosSel_PamlM1M2", "PosSel_PamlM7M8","PosSel_BppM1M2",
"PosSel_BppM7M8", "PosSel_BUSTED")]
oui<-apply(ouitab, 1, function(x) sum(x=="Y"))
stripchart(x=list(oui, non), method="jitter", jitter=0.2,
vertical=T, pch=20, cex=0.5,
group.names=c("Yes Janet", "No Janet"),
ylab="Nb YES from dginn")
@
%\subsubsection{Comparaison des codons?}
%Subtable with lines with both methods showing positive selection.
<<eval=FALSE, echo=FALSE>>=
#cas ou selection + dans les 2 cas
sel<-na.omit(tab[(tab$pVal.M8vsM7<0.05 & tab$PValue_PamlM7M8<0.05),c("Gene.name", "pVal.M8vsM7", "PValue_PamlM7M8", "whole.gene.dN.dS.model.0", "Omega_PamlM7M8", "Number.of.codons.with.BEB....0.9", "Codons.under.positive.selection..BEB..0.9...alignment.position.", "NbSites_PamlM7M8","PSS_PamlM7M8")])
dim(sel)
head(sel)
@
<<nsites, eval=FALSE, echo=FALSE>>=
plot(sel$Number.of.codons.with.BEB....0.9, sel$NbSites_PamlM7M8)
# toujours plus de codon dans la version de janet
listdginn<-sapply(sel$PSS_PamlM7M8, function(x){
tmp<-strsplit(as.character(x), split=",")[[1]]
names(tmp)<-rep("dginn", length(tmp))
return(tmp)
})
names(listdginn)<-sel$Gene.name
listjanet<-sapply(sel$Codons.under.positive.selection..BEB..0.9...alignment.position., function(x){
tmp<-strsplit(as.character(x), split=",")[[1]]
tmp2<-sapply(tmp, function(x) strsplit(as.character(x), split="_")[[1]][1])
tmp2<-unlist(tmp2)
names(tmp2)<-rep("janet", length(tmp2))
return(unlist(tmp2))
})
names(listjanet)<-sel$Gene.name
listjoined<-mapply(c, listdginn, listjanet, SIMPLIFY=FALSE)
@
\end{document}
File added
This diff is collapsed.
File added
File added
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment