Commit 50ae18da authored by mcariou's avatar mcariou
Browse files

update dataset

parent 040507e4
This diff is collapsed.
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -56,8 +56,29 @@ tmp<-na.omit(tab[,c("Gene.name", "bats_BUSTED", "bats_BppM1M2", "bats_BppM7M8",
"bats_codemlM1M2", "bats_codemlM7M8", "dginn.primate_codemlM1M2",
"dginn.primate_codemlM7M8", "dginn.primate_BppM1M2",
"dginn.primate_BppM7M8", "dginn.primate_BUSTED")])
# Columns used for the bats/primates comparison: the gene identifier followed
# by the five test columns of each dataset.
col <- c(
  "Gene.name",
  # bats
  "bats_BUSTED",
  "bats_BppM1M2",
  "bats_BppM7M8",
  "bats_codemlM1M2",
  "bats_codemlM7M8",
  # primates
  "dginn.primate_codemlM1M2",
  "dginn.primate_codemlM7M8",
  "dginn.primate_BppM1M2",
  "dginn.primate_BppM7M8",
  "dginn.primate_BUSTED"
)
# Number of rows and columns of tmp once rows containing NA have been dropped
dim(tmp)
@
\subsection{Omega plot}
<<>>=
# Bpp M0 omega estimates for the genes whose status is "shared", primates (x)
# against bats (y). Values go through as.character() before as.numeric();
# entries that are not numbers become NA.
is_shared <- tab$status == "shared"
x <- as.numeric(as.character(tab$dginn.primate_omegaM0Bpp[is_shared]))
y <- as.numeric(as.character(tab$bats_omegaM0Bpp[is_shared]))
names(x) <- tab$Gene.name[is_shared]

# Scatter plot with the identity line and, in red, the linear fit of y on x
plot(x, y, xlab = "bpp omega primate", ylab = "bpp omega bats", cex = 0.5)
abline(0, 1)
abline(lm(y ~ x), col = "red")

# Label the genes that are high in primates but low in bats, and the reverse;
# each label is drawn 0.01 above its point
primate_high <- x > 0.5 & y < 0.4
bats_high <- x < 0.45 & y > 0.45
text(x[primate_high], y[primate_high] + 0.01, names(x)[primate_high])
text(x[bats_high], y[bats_high] + 0.01, names(x)[bats_high])
@
\subsection{Mondrian}
<<mondrianbats>>=
......
No preview for this file type
......@@ -103,7 +103,7 @@ Analysis were formatted by the script covid\_comp\_script0\_table.Rnw.
\hlkwd{dim}\hlstd{(tab)}
\end{alltt}
\begin{verbatim}
## [1] 442 55
## [1] 442 56
\end{verbatim}
\end{kframe}
\end{knitrout}
......@@ -119,6 +119,10 @@ Analysis were formatted by the script covid\_comp\_script0\_table.Rnw.
\hlstr{"bats_codemlM1M2"}\hlstd{,} \hlstr{"bats_codemlM7M8"}\hlstd{,} \hlstr{"dginn.primate_codemlM1M2"}\hlstd{,}
\hlstr{"dginn.primate_codemlM7M8"}\hlstd{,} \hlstr{"dginn.primate_BppM1M2"}\hlstd{,}
\hlstr{"dginn.primate_BppM7M8"}\hlstd{,} \hlstr{"dginn.primate_BUSTED"}\hlstd{)])}
\hlstd{col}\hlkwb{<-}\hlkwd{c}\hlstd{(}\hlstr{"Gene.name"}\hlstd{,} \hlstr{"bats_BUSTED"}\hlstd{,} \hlstr{"bats_BppM1M2"}\hlstd{,} \hlstr{"bats_BppM7M8"}\hlstd{,}
\hlstr{"bats_codemlM1M2"}\hlstd{,} \hlstr{"bats_codemlM7M8"}\hlstd{,} \hlstr{"dginn.primate_codemlM1M2"}\hlstd{,}
\hlstr{"dginn.primate_codemlM7M8"}\hlstd{,} \hlstr{"dginn.primate_BppM1M2"}\hlstd{,}
\hlstr{"dginn.primate_BppM7M8"}\hlstd{,} \hlstr{"dginn.primate_BUSTED"}\hlstd{)}
\hlkwd{dim}\hlstd{(tmp)}
\end{alltt}
\begin{verbatim}
......@@ -126,6 +130,36 @@ Analysis were formatted by the script covid\_comp\_script0\_table.Rnw.
\end{verbatim}
\end{kframe}
\end{knitrout}
\subsection{Omega plot}
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{x}\hlkwb{=}\hlkwd{as.numeric}\hlstd{(}\hlkwd{as.character}\hlstd{(tab}\hlopt{$}\hlstd{dginn.primate_omegaM0Bpp[tab}\hlopt{$}\hlstd{status}\hlopt{==}\hlstr{"shared"}\hlstd{]))}
\end{alltt}
{\ttfamily\noindent\color{warningcolor}{\#\# Warning: NAs introduits lors de la conversion automatique}}\begin{alltt}
\hlstd{y}\hlkwb{=}\hlkwd{as.numeric}\hlstd{(}\hlkwd{as.character}\hlstd{(tab}\hlopt{$}\hlstd{bats_omegaM0Bpp[tab}\hlopt{$}\hlstd{status}\hlopt{==}\hlstr{"shared"}\hlstd{]))}
\end{alltt}
{\ttfamily\noindent\color{warningcolor}{\#\# Warning: NAs introduits lors de la conversion automatique}}\begin{alltt}
\hlkwd{names}\hlstd{(x)}\hlkwb{<-}\hlstd{tab}\hlopt{$}\hlstd{Gene.name[tab}\hlopt{$}\hlstd{status}\hlopt{==}\hlstr{"shared"}\hlstd{]}
\hlkwd{plot}\hlstd{(x,y,} \hlkwc{xlab}\hlstd{=}\hlstr{"bpp omega primate"}\hlstd{,} \hlkwc{ylab}\hlstd{=}\hlstr{"bpp omega bats"}\hlstd{,} \hlkwc{cex}\hlstd{=}\hlnum{0.5}\hlstd{)}
\hlkwd{abline}\hlstd{(}\hlnum{0}\hlstd{,}\hlnum{1}\hlstd{)}
\hlkwd{abline}\hlstd{(}\hlkwd{lm}\hlstd{(y}\hlopt{~}\hlstd{x),} \hlkwc{col}\hlstd{=}\hlstr{"red"}\hlstd{)}
\hlkwd{text}\hlstd{(x[x}\hlopt{>}\hlnum{0.5} \hlopt{&}\hlstd{y}\hlopt{<}\hlnum{0.4}\hlstd{], (y[x}\hlopt{>}\hlnum{0.5} \hlopt{&}\hlstd{y}\hlopt{<}\hlnum{0.4}\hlstd{]}\hlopt{+}\hlnum{0.01}\hlstd{),} \hlkwd{names}\hlstd{(x)[x}\hlopt{>}\hlnum{0.5} \hlopt{&}\hlstd{y}\hlopt{<}\hlnum{0.4}\hlstd{])}
\hlkwd{text}\hlstd{(x[x}\hlopt{<}\hlnum{0.45} \hlopt{&}\hlstd{y}\hlopt{>}\hlnum{0.45}\hlstd{], (y[x}\hlopt{<}\hlnum{0.45} \hlopt{&}\hlstd{y}\hlopt{>}\hlnum{0.45}\hlstd{]}\hlopt{+}\hlnum{0.01}\hlstd{),} \hlkwd{names}\hlstd{(x)[x}\hlopt{<}\hlnum{0.45} \hlopt{&}\hlstd{y}\hlopt{>}\hlnum{0.45}\hlstd{])}
\end{alltt}
\end{kframe}
\includegraphics[width=\maxwidth]{figure/unnamed-chunk-3-1}
\end{knitrout}
\subsection{Mondrian}
\begin{knitrout}
......
......@@ -264,7 +264,7 @@ tab<-merge(tab,dginnbats, by="cooper.batsGene", all.x=T)
<<>>=
# Export tab as tab-separated text, without row names and without quoting.
# NOTE(review): these two calls look like the before/after sides of a diff
# hunk (same table, only the file name differs) -- confirm both writes are
# intended.
write.table(tab, "covid_comp_complete.txt", row.names=FALSE, quote=FALSE, sep="\t")
write.table(tab, "covid_comp_complete_old.txt", row.names=FALSE, quote=FALSE, sep="\t")
@
......@@ -432,6 +432,7 @@ TMPRSS2 in bats
# List the bat entries currently named TMPRSS2
dginnbats[dginnbats$Gene.name == "TMPRSS2", ]
# The uncut entry keeps the name TMPRSS2; the cut one becomes TMPRSS2_cut.
# Gene.name is converted to character first so that the new name can be stored
# (assigning an unknown level to a factor yields NA).
dginnbats$Gene.name <- as.character(dginnbats$Gene.name)
is_cut_entry <- dginnbats$bats_File == "TMPRSS2_bat_select_cut_mafft_prank"
dginnbats[is_cut_entry, "Gene.name"] <- "TMPRSS2_cut"
@
......@@ -491,5 +492,35 @@ write.table(tab, "covid_comp_alldginn.txt", sep="\t")
@
\section{Complete data}
Merge the previous tab with J Young's original table. \textbf{Will replace the 1st part of this script}
<<>>=
# Load J. Young's original results table from the data directory under
# workdir. The file is read with read.delim(), using "," as decimal mark.
# header and fill are spelled out in full with TRUE: T can be reassigned and
# h= only works through partial argument matching.
young <- read.delim(
  paste0(
    workdir,
    "data/COVID_PAMLresults_332hits_plusBatScreens_2020_Apr14.csv"
  ),
  header = TRUE,
  fill = TRUE,
  dec = ","
)
# Size of the imported table (rows, columns)
dim(young)
@
How many genes in the Young table are missing from the DGINN table, and which ones are they?
<<>>=
# Count the Young genes with (TRUE) and without (FALSE) a match in tab
table(young$Gene.name %in% tab$Gene.name)
# Names of the Young genes that have no match in tab
missing_from_tab <- !(young$Gene.name %in% tab$Gene.name)
young[missing_from_tab, "Gene.name"]
@
Merge them and keep only the Krogan genes.
<<>>=
# Left join on Gene.name: every row of young is kept, and rows of tab without
# a match in young are dropped
tablo <- merge(x = young, y = tab, by = "Gene.name", all.x = TRUE)
# Export the merged table as quoted, tab-separated text without row names
write.table(
  x = tablo,
  file = "covid_comp_complete.txt",
  quote = TRUE,
  sep = "\t",
  row.names = FALSE
)
@
\end{document}
......@@ -517,7 +517,7 @@ DGINN results from different analysis.
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlkwd{write.table}\hlstd{(tab,} \hlstr{"covid_comp_complete.txt"}\hlstd{,} \hlkwc{row.names}\hlstd{=}\hlnum{FALSE}\hlstd{,} \hlkwc{quote}\hlstd{=}\hlnum{FALSE}\hlstd{,} \hlkwc{sep}\hlstd{=}\hlstr{"\textbackslash{}t"}\hlstd{)}
\hlkwd{write.table}\hlstd{(tab,} \hlstr{"covid_comp_complete_old.txt"}\hlstd{,} \hlkwc{row.names}\hlstd{=}\hlnum{FALSE}\hlstd{,} \hlkwc{quote}\hlstd{=}\hlnum{FALSE}\hlstd{,} \hlkwc{sep}\hlstd{=}\hlstr{"\textbackslash{}t"}\hlstd{)}
\end{alltt}
\end{kframe}
\end{knitrout}
......@@ -1066,11 +1066,10 @@ TMPRSS2 in bats
\begin{alltt}
\hlcom{# keeping the uncut one}
\hlcom{# renaming the other one TMPRSS2_cut}
\hlstd{dginnbats}\hlopt{$}\hlstd{Gene.name}\hlkwb{<-}\hlkwd{as.character}\hlstd{(dginnbats}\hlopt{$}\hlstd{Gene.name)}
\hlstd{dginnbats[dginnbats}\hlopt{$}\hlstd{bats_File}\hlopt{==}\hlstr{"TMPRSS2_bat_select_cut_mafft_prank"}\hlstd{,}\hlstr{"Gene.name"}\hlstd{]}\hlkwb{<-}\hlstr{"TMPRSS2_cut"}
\end{alltt}
{\ttfamily\noindent\color{warningcolor}{\#\# Warning in `[<-.factor`(`*tmp*`, iseq, value = "{}TMPRSS2\_cut"{}): invalid factor level, NA generated}}\end{kframe}
\end{kframe}
\end{knitrout}
RIPK1: Ancestral version kept, suppress it "RIPK1\_sequences\_filtered\_longestORFs\_mafft\_mincov\_prank"
......@@ -1255,7 +1254,7 @@ GNG5
## [19] "SIGMAR1" "SLC44A2[0-2820]" "SLC44A2[2819-3792]"
## [22] "TLE5" "USP13" "ZC3H18[0-1101]"
## [25] "ZC3H18[1100-3678]" "FGFR1OP" "ELOB"
## [28] "REEP6_like" "SELENOS" NA
## [28] "REEP6_like" "SELENOS" "TMPRSS2_cut"
\end{verbatim}
\begin{alltt}
\hlkwd{length}\hlstd{(dginnbats}\hlopt{$}\hlstd{Gene.name[(dginnbats}\hlopt{$}\hlstd{Gene.name} \hlopt{%in%} \hlstd{dginnT}\hlopt{$}\hlstd{Gene.name)}\hlopt{==}\hlnum{FALSE}\hlstd{])}
......@@ -1295,5 +1294,63 @@ GNG5
\end{knitrout}
\section{Complete data}
Merge the previous tab with J Young's original table. \textbf{Will replace the 1st part of this script}
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{young}\hlkwb{<-}\hlkwd{read.delim}\hlstd{(}\hlkwd{paste0}\hlstd{(workdir,}
\hlstr{"data/COVID_PAMLresults_332hits_plusBatScreens_2020_Apr14.csv"}\hlstd{),}
\hlkwc{fill}\hlstd{=T,} \hlkwc{h}\hlstd{=T,} \hlkwc{dec}\hlstd{=}\hlstr{","}\hlstd{)}
\hlkwd{dim}\hlstd{(young)}
\end{alltt}
\begin{verbatim}
## [1] 332 84
\end{verbatim}
\end{kframe}
\end{knitrout}
How many genes in the Young table are missing from the DGINN table, and which ones are they?
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlkwd{table}\hlstd{(young}\hlopt{$}\hlstd{Gene.name} \hlopt{%in%} \hlstd{tab}\hlopt{$}\hlstd{Gene.name)}
\end{alltt}
\begin{verbatim}
##
## FALSE TRUE
## 15 317
\end{verbatim}
\begin{alltt}
\hlstd{young[(young}\hlopt{$}\hlstd{Gene.name} \hlopt{%in%} \hlstd{tab}\hlopt{$}\hlstd{Gene.name)}\hlopt{==}\hlnum{FALSE}\hlstd{,} \hlstr{"Gene.name"}\hlstd{]}
\end{alltt}
\begin{verbatim}
## [1] C19orf52 ERO1LB ATP5L NUPL1 ADCK4 SPG20
## [7] WHSC1 FAM134C 01/03/2020 VIMP TCEB1 C1orf50
## [13] TOMM70A KIAA1033 TCEB2
## 332 Levels: 01/03/2020 AAR2 AASS AATF ABCC1 ACAD9 ACADM ... ZYG11B
\end{verbatim}
\end{kframe}
\end{knitrout}
Merge them and keep only the Krogan genes.
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{tablo}\hlkwb{<-}\hlkwd{merge}\hlstd{(young, tab,} \hlkwc{by}\hlstd{=}\hlstr{"Gene.name"}\hlstd{,} \hlkwc{all.x}\hlstd{=}\hlnum{TRUE}\hlstd{)}
\hlkwd{write.table}\hlstd{(tablo,} \hlstr{"covid_comp_complete.txt"}\hlstd{,} \hlkwc{row.names}\hlstd{=}\hlnum{FALSE}\hlstd{,} \hlkwc{quote}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{sep}\hlstd{=}\hlstr{"\textbackslash{}t"}\hlstd{)}
\end{alltt}
\end{kframe}
\end{knitrout}
\end{document}
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment