Commit 17105310 authored by your name's avatar your name
Browse files

script for data formating. Big table creation

parent fbde95f6
\documentclass[11pt, oneside]{article} % use "amsart" instead of "article" for AMSLaTeX format
%\usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots.
%\geometry{letterpaper} % ... or a4paper or a5paper or ...
%\geometry{landscape} % Activate for for rotated page geometry
%\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
%\usepackage{graphicx} % Use pdf, png, jpg, or eps with pdflatex; use eps in DVI mode
% TeX will automatically convert eps --> pdf in pdflatex
%\usepackage{amssymb}
\usepackage[utf8]{inputenc}
%\usepackage[cyr]{aeguill}
%\usepackage[francais]{babel}
%\usepackage{hyperref}
\title{Positive selection on genes interacting with SARS-Cov2, comparison of different analysis}
\author{Marie Cariou}
\date{Janvier 2021} % Activate to display a given date or no date
\begin{document}
\maketitle
\tableofcontents
\newpage
\section{Files manipulations}
\subsection{Read Janet Young's table}
<<>>=
workdir<-"/home/adminmarie/Documents/CIRI_BIBS_projects/2020_05_Etienne_covid/"
tab<-read.delim(paste0(workdir,
"data/COVID_PAMLresults_332hits_plusBatScreens_2020_Apr14.csv"),
fill=T, h=T, dec=",")
dim(tab)
@
\subsection{Read DGINN Young table}
<<>>=
dginnY<-read.delim(paste0(workdir,
"data/summary_primate_young.res"),
fill=T, h=T)
dim(dginnY)
@
\subsection{Joining Young and DGINN Young table}
<<>>=
# correct gene names (MARC1)
val_remp=as.character(unique(dginnY$Gene)[(unique(dginnY$Gene) %in%
tab$Gene.name)==F])
tab$Gene.name<-as.character(tab$Gene.name)
tab$Gene.name[158]<-val_remp
sum(unique(dginnY$Gene) %in% unique(tab$Gene.name))
@
<<>>=
add_col<-function(method="PamlM1M2"){
tmp<-dginnY[dginnY$Method==method,
c("Gene", "Omega", "PosSel", "PValue", "NbSites", "PSS")]
names(tmp)<-c("Gene.name", paste0("Omega_", method),
paste0("PosSel_", method), paste0("PValue_", method),
paste0("NbSites_", method), paste0("PSS_", method))
tab<-merge(tab, tmp, by="Gene.name")
return(tab)
}
tab<-add_col("PamlM1M2")
tab<-add_col("PamlM7M8")
tab<-add_col("BppM1M2")
tab<-add_col("BppM7M8")
# Manip pour la colonne BUSTED
tmp<-dginnY[dginnY$Method=="BUSTED",c("Gene", "Omega", "PosSel", "PValue")]
names(tmp)<-c("Gene.name", "Omega_BUSTED", "PosSel_BUSTED", "PValue_BUSTED")
tab<-merge(tab, tmp, by="Gene.name")
tmp<-dginnY[dginnY$Method=="MEME",c("Gene", "NbSites", "PSS")]
names(tmp)<-c("Gene.name", "NbSites_MEME", "PSS_MEME")
tab<-merge(tab, tmp, by="Gene.name")
@
\subsection{Read DGINN Table}
<<>>=
dginnT<-read.delim(paste0(workdir,
"data/DGINN_202005281649summary_cleaned.csv"),
fill=T, h=T, sep=",")
dim(dginnT)
names(dginnT)
# Number of genes in dginn-primate output not present in the original table
dginnT[(dginnT$Gene %in% tab$Gene.name)==F,"Gene"]
# This includes paralogs, recombinations found by DGINN and additionnal genes
# included on purpose
# Number of genes from the original list not present in DGINN output
tab[(tab$Gene.name %in% dginnT$Gene)==F,"Gene.name"]
names(dginnT)<-c("File", "Name", "Gene.name", "GeneSize",
"dginn-primate_NbSpecies", "dginn-primate_omegaM0Bpp",
"dginn-primate_omegaM0codeml", "dginn-primate_BUSTED",
"dginn-primate_BUSTED.p.value", "dginn-primate_MEME.NbSites",
"dginn-primate_MEME.PSS", "dginn-primate_BppM1M2",
"dginn-primate_BppM1M2.p.value", "dginn-primate_BppM1M2.NbSites",
"dginn-primate_BppM1M2.PSS", "dginn-primate_BppM7M8",
"dginn-primate_BppM7M8.p.value", "dginn-primate_BppM7M8.NbSites",
"dginn-primate_BppM7M8.PSS", "dginn-primate_codemlM1M2",
"dginn-primate_codemlM1M2.p.value", "dginn-primate_codemlM1M2.NbSites",
"dginn-primate_codemlM1M2.PSS", "dginn-primate_codemlM7M8",
"dginn-primate_codemlM7M8.p.value", "dginn-primate_codemlM7M8.NbSites",
"dginn-primate_codemlM7M8.PSS")
@
\subsection{Join Table and DGINN table}
<<>>=
tab<-merge(tab,dginnT, by="Gene.name", all.x=T)
@
\subsection{Add DGINN results on bat dataset}
DGINN results from different analysis.
<<>>=
# original table
dginnbats<-read.delim(paste0(workdir,
"data/DGINN_202005281339summary_cleaned.tab"),
fill=T, h=T)
# rerun on corrected alignment
dginnbatsnew1<-read.delim(paste0(workdir,
"data/DGINN_202011262248_summary.tab"),
fill=T, h=T)
dginnbatsnew2<-read.delim(paste0(workdir,
"data/DGINN_202012192053_summary.tab"),
fill=T, h=T)
# colomne choice, BUSTED and Bppml form first file, codeml from the other one
dginnbatsnew<-dginnbatsnew1
dginnbatsnew$omegaM0codeml<-dginnbatsnew2$omegaM0codeml
dginnbatsnew$codemlM1M2<-dginnbatsnew2$codemlM1M2
dginnbatsnew$codemlM1M2_p.value<-dginnbatsnew2$codemlM1M2_p.value
dginnbatsnew$codemlM1M2_NbSites<-dginnbatsnew2$codemlM1M2_NbSites
dginnbatsnew$codemlM1M2_PSS<-dginnbatsnew2$codemlM1M2_PSS
dginnbatsnew$codemlM7M8<-dginnbatsnew2$codemlM7M8
dginnbatsnew$codemlM7M8_p.value<-dginnbatsnew2$codemlM7M8_p.value
dginnbatsnew$codemlM7M8_NbSites<-dginnbatsnew2$codemlM7M8_NbSites
dginnbatsnew$codemlM7M8_PSS<-dginnbatsnew2$codemlM7M8_PSS
####
## RIPK1 is actually a primat results
## 1. Take it and put it at the right place
ripk1<-as.vector(dginnbatsnew[dginnbatsnew$Gene=="RIPK1",])
tab$`dginn-primate_omegaM0Bpp`<-as.numeric(as.character(tab$`dginn-primate_omegaM0Bpp`))
tab$`dginn-primate_BUSTED.p.value`<-as.numeric(as.character(tab$`dginn-primate_BUSTED.p.value`))
tab$`dginn-primate_BppM1M2.p.value`<-as.numeric(as.character(tab$`dginn-primate_BppM1M2.p.value`))
tab$`dginn-primate_BppM7M8.p.value`<-as.numeric(as.character(tab$`dginn-primate_BppM7M8.p.value`))
tab$`dginn-primate_BppM7M8.PSS`<-as.numeric(as.character(tab$`dginn-primate_BppM7M8.PSS`))
tab$`dginn-primate_codemlM1M2.p.value`<-as.numeric(as.character(tab$`dginn-primate_codemlM1M2.p.value`))
tab$`dginn-primate_codemlM1M2.PSS`<-as.numeric(as.character(tab$`dginn-primate_codemlM1M2.PSS`))
tab$`dginn-primate_codemlM7M8.p.value`<-as.numeric(as.character(tab$`dginn-primate_codemlM7M8.p.value`))
tab$`dginn-primate_codemlM7M8.PSS`<-as.numeric(as.character(tab$`dginn-primate_codemlM7M8.PSS`))
tab[tab$Gene.name=="RIPK1","GeneSize"]<-ripk1$GeneSize
tab[tab$Gene.name=="RIPK1","dginn-primate_NbSpecies"]<-ripk1$NbSpecies
tab[tab$Gene.name=="RIPK1","dginn-primate_omegaM0Bpp"]<-ripk1$omegaM0Bpp
tab[tab$Gene.name=="RIPK1","dginn-primate_omegaM0codeml"]<-ripk1$omegaM0codeml
tab[tab$Gene.name=="RIPK1","dginn-primate_BUSTED"]<-ripk1$BUSTED
tab[tab$Gene.name=="RIPK1","dginn-primate_BUSTED.p.value"]<-ripk1$BUSTED_p.value
tab[tab$Gene.name=="RIPK1","dginn-primate_MEME.NbSites"]<-ripk1$MEME_NbSites
tab[tab$Gene.name=="RIPK1","dginn-primate_MEME.PSS"]<-as.numeric(as.character(ripk1$MEME_PSS))
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM1M2"]<-ripk1$BppM1M2
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM1M2.p.value"]<-ripk1$BppM1M2_p.value
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM1M2.NbSites"]<-ripk1$BppM1M2_NbSites
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM1M2.PSS"]<-ripk1$BppM1M2_PSS
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM7M8"]<-ripk1$BppM7M8
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM7M8.p.value"]<-ripk1$BppM7M8_p.value
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM7M8.NbSites"]<-ripk1$BppM7M8_NbSites
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM7M8.PSS"]<-ripk1$BppM7M8_PSS
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM1M2"]<-ripk1$codemlM1M2
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM1M2.p.value"]<-ripk1$codemlM1M2_p.value
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM1M2.NbSites"]<-ripk1$codemlM1M2_NbSites
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM1M2.PSS"]<-ripk1$codemlM1M2_PSS
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM7M8"]<-ripk1$codemlM7M8
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM7M8.p.value"]<-ripk1$codemlM7M8_p.value
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM7M8.NbSites"]<-ripk1$codemlM7M8_NbSites
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM7M8.PSS"]<-ripk1$codemlM7M8_PSS
## 2. Remove it
dginnbatsnew<-dginnbatsnew[dginnbatsnew$Gene!="RIPK1",]
## suppress redundant lines
dginnbats<-dginnbats[(dginnbats$Gene %in% dginnbatsnew$Gene)==FALSE,]
names(dginnbatsnew)<-names(dginnbats)
##############"
dginnbatsnew[,4]<-as.numeric(dginnbatsnew[,4])
dginnbats[,6]<-as.numeric(as.character(dginnbats[,6]))
dginnbats[,8]<-as.character(dginnbats[,8])
dginnbats[,12]<-as.character(dginnbats[,12])
dginnbats[,13]<-as.numeric(as.character(dginnbats[,13]))
dginnbats[,16]<-as.character(dginnbats[,16])
dginnbats[,17]<-as.numeric(as.character(dginnbats[,17]))
## replace by new data
dginnbats<-rbind(dginnbats, dginnbatsnew)
names(dginnbats)<-c("File", "bats_Name", "cooper.batsGene", paste0("bats_",
names(dginnbats)[-(1:3)]))
names(dginnbats)
tab<-merge(tab,dginnbats, by="cooper.batsGene", all.x=T)
@
\subsection{Write the new table}
<<>>=
write.table(tab, "covid_comp_complete.txt", row.names=FALSE, quote=FALSE, sep="\t")
@
\end{document}
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment