Skip to content
Snippets Groups Projects
Commit 17105310 authored by your name's avatar your name
Browse files

script for data formating. Big table creation

parent fbde95f6
No related branches found
No related tags found
No related merge requests found
\documentclass[11pt, oneside]{article} % use "amsart" instead of "article" for AMSLaTeX format
%\usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots.
%\geometry{letterpaper} % ... or a4paper or a5paper or ...
%\geometry{landscape} % Activate for for rotated page geometry
%\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
%\usepackage{graphicx} % Use pdf, png, jpg, or eps with pdflatex; use eps in DVI mode
% TeX will automatically convert eps --> pdf in pdflatex
%\usepackage{amssymb}
\usepackage[utf8]{inputenc}
%\usepackage[cyr]{aeguill}
%\usepackage[francais]{babel}
%\usepackage{hyperref}
\title{Positive selection on genes interacting with SARS-Cov2, comparison of different analysis}
\author{Marie Cariou}
\date{Janvier 2021} % Activate to display a given date or no date
\begin{document}
\maketitle
\tableofcontents
\newpage
\section{Files manipulations}
\subsection{Read Janet Young's table}
<<>>=
workdir<-"/home/adminmarie/Documents/CIRI_BIBS_projects/2020_05_Etienne_covid/"
tab<-read.delim(paste0(workdir,
"data/COVID_PAMLresults_332hits_plusBatScreens_2020_Apr14.csv"),
fill=T, h=T, dec=",")
dim(tab)
@
\subsection{Read DGINN Young table}
<<>>=
dginnY<-read.delim(paste0(workdir,
"data/summary_primate_young.res"),
fill=T, h=T)
dim(dginnY)
@
\subsection{Joining Young and DGINN Young table}
<<>>=
# correct gene names (MARC1)
val_remp=as.character(unique(dginnY$Gene)[(unique(dginnY$Gene) %in%
tab$Gene.name)==F])
tab$Gene.name<-as.character(tab$Gene.name)
tab$Gene.name[158]<-val_remp
sum(unique(dginnY$Gene) %in% unique(tab$Gene.name))
@
<<>>=
add_col<-function(method="PamlM1M2"){
tmp<-dginnY[dginnY$Method==method,
c("Gene", "Omega", "PosSel", "PValue", "NbSites", "PSS")]
names(tmp)<-c("Gene.name", paste0("Omega_", method),
paste0("PosSel_", method), paste0("PValue_", method),
paste0("NbSites_", method), paste0("PSS_", method))
tab<-merge(tab, tmp, by="Gene.name")
return(tab)
}
tab<-add_col("PamlM1M2")
tab<-add_col("PamlM7M8")
tab<-add_col("BppM1M2")
tab<-add_col("BppM7M8")
# Manip pour la colonne BUSTED
tmp<-dginnY[dginnY$Method=="BUSTED",c("Gene", "Omega", "PosSel", "PValue")]
names(tmp)<-c("Gene.name", "Omega_BUSTED", "PosSel_BUSTED", "PValue_BUSTED")
tab<-merge(tab, tmp, by="Gene.name")
tmp<-dginnY[dginnY$Method=="MEME",c("Gene", "NbSites", "PSS")]
names(tmp)<-c("Gene.name", "NbSites_MEME", "PSS_MEME")
tab<-merge(tab, tmp, by="Gene.name")
@
\subsection{Read DGINN Table}
<<>>=
dginnT<-read.delim(paste0(workdir,
"data/DGINN_202005281649summary_cleaned.csv"),
fill=T, h=T, sep=",")
dim(dginnT)
names(dginnT)
# Number of genes in dginn-primate output not present in the original table
dginnT[(dginnT$Gene %in% tab$Gene.name)==F,"Gene"]
# This includes paralogs, recombinations found by DGINN and additionnal genes
# included on purpose
# Number of genes from the original list not present in DGINN output
tab[(tab$Gene.name %in% dginnT$Gene)==F,"Gene.name"]
names(dginnT)<-c("File", "Name", "Gene.name", "GeneSize",
"dginn-primate_NbSpecies", "dginn-primate_omegaM0Bpp",
"dginn-primate_omegaM0codeml", "dginn-primate_BUSTED",
"dginn-primate_BUSTED.p.value", "dginn-primate_MEME.NbSites",
"dginn-primate_MEME.PSS", "dginn-primate_BppM1M2",
"dginn-primate_BppM1M2.p.value", "dginn-primate_BppM1M2.NbSites",
"dginn-primate_BppM1M2.PSS", "dginn-primate_BppM7M8",
"dginn-primate_BppM7M8.p.value", "dginn-primate_BppM7M8.NbSites",
"dginn-primate_BppM7M8.PSS", "dginn-primate_codemlM1M2",
"dginn-primate_codemlM1M2.p.value", "dginn-primate_codemlM1M2.NbSites",
"dginn-primate_codemlM1M2.PSS", "dginn-primate_codemlM7M8",
"dginn-primate_codemlM7M8.p.value", "dginn-primate_codemlM7M8.NbSites",
"dginn-primate_codemlM7M8.PSS")
@
\subsection{Join Table and DGINN table}
<<>>=
tab<-merge(tab,dginnT, by="Gene.name", all.x=T)
@
\subsection{Add DGINN results on bat dataset}
DGINN results from different analysis.
<<>>=
# original table
dginnbats<-read.delim(paste0(workdir,
"data/DGINN_202005281339summary_cleaned.tab"),
fill=T, h=T)
# rerun on corrected alignment
dginnbatsnew1<-read.delim(paste0(workdir,
"data/DGINN_202011262248_summary.tab"),
fill=T, h=T)
dginnbatsnew2<-read.delim(paste0(workdir,
"data/DGINN_202012192053_summary.tab"),
fill=T, h=T)
# colomne choice, BUSTED and Bppml form first file, codeml from the other one
dginnbatsnew<-dginnbatsnew1
dginnbatsnew$omegaM0codeml<-dginnbatsnew2$omegaM0codeml
dginnbatsnew$codemlM1M2<-dginnbatsnew2$codemlM1M2
dginnbatsnew$codemlM1M2_p.value<-dginnbatsnew2$codemlM1M2_p.value
dginnbatsnew$codemlM1M2_NbSites<-dginnbatsnew2$codemlM1M2_NbSites
dginnbatsnew$codemlM1M2_PSS<-dginnbatsnew2$codemlM1M2_PSS
dginnbatsnew$codemlM7M8<-dginnbatsnew2$codemlM7M8
dginnbatsnew$codemlM7M8_p.value<-dginnbatsnew2$codemlM7M8_p.value
dginnbatsnew$codemlM7M8_NbSites<-dginnbatsnew2$codemlM7M8_NbSites
dginnbatsnew$codemlM7M8_PSS<-dginnbatsnew2$codemlM7M8_PSS
####
## RIPK1 is actually a primat results
## 1. Take it and put it at the right place
ripk1<-as.vector(dginnbatsnew[dginnbatsnew$Gene=="RIPK1",])
tab$`dginn-primate_omegaM0Bpp`<-as.numeric(as.character(tab$`dginn-primate_omegaM0Bpp`))
tab$`dginn-primate_BUSTED.p.value`<-as.numeric(as.character(tab$`dginn-primate_BUSTED.p.value`))
tab$`dginn-primate_BppM1M2.p.value`<-as.numeric(as.character(tab$`dginn-primate_BppM1M2.p.value`))
tab$`dginn-primate_BppM7M8.p.value`<-as.numeric(as.character(tab$`dginn-primate_BppM7M8.p.value`))
tab$`dginn-primate_BppM7M8.PSS`<-as.numeric(as.character(tab$`dginn-primate_BppM7M8.PSS`))
tab$`dginn-primate_codemlM1M2.p.value`<-as.numeric(as.character(tab$`dginn-primate_codemlM1M2.p.value`))
tab$`dginn-primate_codemlM1M2.PSS`<-as.numeric(as.character(tab$`dginn-primate_codemlM1M2.PSS`))
tab$`dginn-primate_codemlM7M8.p.value`<-as.numeric(as.character(tab$`dginn-primate_codemlM7M8.p.value`))
tab$`dginn-primate_codemlM7M8.PSS`<-as.numeric(as.character(tab$`dginn-primate_codemlM7M8.PSS`))
tab[tab$Gene.name=="RIPK1","GeneSize"]<-ripk1$GeneSize
tab[tab$Gene.name=="RIPK1","dginn-primate_NbSpecies"]<-ripk1$NbSpecies
tab[tab$Gene.name=="RIPK1","dginn-primate_omegaM0Bpp"]<-ripk1$omegaM0Bpp
tab[tab$Gene.name=="RIPK1","dginn-primate_omegaM0codeml"]<-ripk1$omegaM0codeml
tab[tab$Gene.name=="RIPK1","dginn-primate_BUSTED"]<-ripk1$BUSTED
tab[tab$Gene.name=="RIPK1","dginn-primate_BUSTED.p.value"]<-ripk1$BUSTED_p.value
tab[tab$Gene.name=="RIPK1","dginn-primate_MEME.NbSites"]<-ripk1$MEME_NbSites
tab[tab$Gene.name=="RIPK1","dginn-primate_MEME.PSS"]<-as.numeric(as.character(ripk1$MEME_PSS))
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM1M2"]<-ripk1$BppM1M2
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM1M2.p.value"]<-ripk1$BppM1M2_p.value
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM1M2.NbSites"]<-ripk1$BppM1M2_NbSites
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM1M2.PSS"]<-ripk1$BppM1M2_PSS
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM7M8"]<-ripk1$BppM7M8
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM7M8.p.value"]<-ripk1$BppM7M8_p.value
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM7M8.NbSites"]<-ripk1$BppM7M8_NbSites
tab[tab$Gene.name=="RIPK1","dginn-primate_BppM7M8.PSS"]<-ripk1$BppM7M8_PSS
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM1M2"]<-ripk1$codemlM1M2
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM1M2.p.value"]<-ripk1$codemlM1M2_p.value
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM1M2.NbSites"]<-ripk1$codemlM1M2_NbSites
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM1M2.PSS"]<-ripk1$codemlM1M2_PSS
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM7M8"]<-ripk1$codemlM7M8
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM7M8.p.value"]<-ripk1$codemlM7M8_p.value
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM7M8.NbSites"]<-ripk1$codemlM7M8_NbSites
tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM7M8.PSS"]<-ripk1$codemlM7M8_PSS
## 2. Remove it
dginnbatsnew<-dginnbatsnew[dginnbatsnew$Gene!="RIPK1",]
## suppress redundant lines
dginnbats<-dginnbats[(dginnbats$Gene %in% dginnbatsnew$Gene)==FALSE,]
names(dginnbatsnew)<-names(dginnbats)
##############"
dginnbatsnew[,4]<-as.numeric(dginnbatsnew[,4])
dginnbats[,6]<-as.numeric(as.character(dginnbats[,6]))
dginnbats[,8]<-as.character(dginnbats[,8])
dginnbats[,12]<-as.character(dginnbats[,12])
dginnbats[,13]<-as.numeric(as.character(dginnbats[,13]))
dginnbats[,16]<-as.character(dginnbats[,16])
dginnbats[,17]<-as.numeric(as.character(dginnbats[,17]))
## replace by new data
dginnbats<-rbind(dginnbats, dginnbatsnew)
names(dginnbats)<-c("File", "bats_Name", "cooper.batsGene", paste0("bats_",
names(dginnbats)[-(1:3)]))
names(dginnbats)
tab<-merge(tab,dginnbats, by="cooper.batsGene", all.x=T)
@
\subsection{Write the new table}
<<>>=
write.table(tab, "covid_comp_complete.txt", row.names=FALSE, quote=FALSE, sep="\t")
@
\end{document}
File added
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment