Skip to content
Snippets Groups Projects
Commit 7a67f087 authored by mcariou's avatar mcariou
Browse files

add format script

parent d882bb64
No related branches found
No related tags found
No related merge requests found
\documentclass[11pt, oneside]{article} % use "amsart" instead of "article" for AMSLaTeX format
%\usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots.
%\geometry{letterpaper} % ... or a4paper or a5paper or ...
%\geometry{landscape} % Activate for for rotated page geometry
%\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
%\usepackage{graphicx} % Use pdf, png, jpg, or eps with pdflatex; use eps in DVI mode
% TeX will automatically convert eps --> pdf in pdflatex
%\usepackage{amssymb}
\usepackage[utf8]{inputenc}
%\usepackage[cyr]{aeguill}
%\usepackage[francais]{babel}
%\usepackage{hyperref}
\title{Positive selection on genes interacting with SARS-Cov2, Data formating}
\author{Marie Cariou}
\date{Janvier 2021} % Activate to display a given date or no date
\begin{document}
\maketitle
\tableofcontents
\newpage
\section{1st table}
Table containing the DGINN results for both Primates and bats. Conserve all genes.
\subsection{Primates}
Workdir must be adapted to local environment
<<>>=
workdir<-"/home/adminmarie/Documents/CIRI_BIBS_projects/2020_05_Etienne_covid/2020_dginn_covid19/"
#workdir<-getwd()
@
<<>>=
dginnT<-read.delim(paste0(workdir,
"data/DGINN_202005281649summary_cleaned.csv"),
fill=T, h=T, sep=",")
dim(dginnT)
#names(dginnT)
# Rename the columns to include primate
names(dginnT)<-c("File", "Name", "Gene.name", "GeneSize",
"dginn-primate_NbSpecies", "dginn-primate_omegaM0Bpp",
"dginn-primate_omegaM0codeml", "dginn-primate_BUSTED",
"dginn-primate_BUSTED.p.value", "dginn-primate_MEME.NbSites",
"dginn-primate_MEME.PSS", "dginn-primate_BppM1M2",
"dginn-primate_BppM1M2.p.value", "dginn-primate_BppM1M2.NbSites",
"dginn-primate_BppM1M2.PSS", "dginn-primate_BppM7M8",
"dginn-primate_BppM7M8.p.value", "dginn-primate_BppM7M8.NbSites",
"dginn-primate_BppM7M8.PSS", "dginn-primate_codemlM1M2",
"dginn-primate_codemlM1M2.p.value", "dginn-primate_codemlM1M2.NbSites",
"dginn-primate_codemlM1M2.PSS", "dginn-primate_codemlM7M8",
"dginn-primate_codemlM7M8.p.value", "dginn-primate_codemlM7M8.NbSites",
"dginn-primate_codemlM7M8.PSS")
@
Add SELENOS
<<selenos>>=
selenos<-read.delim(paste0(workdir,
"data/resSELENOS.tab"))
# liste of colonne
colonnes<-c("File", "Name", "Gene", "GeneSize",
"NbSpecies", "omegaM0Bpp", "omegaM0codeml", "BUSTED",
"BUSTED_p.value", "MEME_NbSites", "MEME_PSS", "BppM1M2",
"BppM1M2_p.value", "BppM1M2_NbSites", "BppM1M2_PSS", "BppM7M8",
"BppM7M8_p.value", "BppM7M8_NbSites", "BppM7M8_PSS","codemlM1M2",
"codemlM1M2_p.value", "codemlM1M2_NbSites", "codemlM1M2_PSS",
"codemlM7M8", "codemlM7M8_p.value", "codemlM7M8_NbSites",
"codemlM7M8_PSS")
selenos<-selenos[,colonnes]
@
<<>>=
names(selenos)<-names(dginnT)
selenos[,6]<-as.factor(selenos[,6])
selenos[,9]<-as.factor(selenos[,9])
selenos[,11]<-as.factor(selenos[,11])
selenos[,13]<-as.factor(selenos[,13])
selenos[,17]<-as.factor(selenos[,17])
selenos[,21]<-as.factor(selenos[,21])
selenos[,25]<-as.factor(selenos[,25])
## convertir les pvalues
dginnT<-rbind(dginnT, selenos)
@
\subsection{Bats}
<<>>=
# original table
dginnbats<-read.delim(paste0(workdir,
"data/DGINN_202005281339summary_cleaned-LE201108.txt"),
fill=T, h=T)
# rerun on corrected alignment
dginnbatsnew<-read.delim(paste0(workdir,
"data/DGINN_202011262248_hyphybpp-202012192053_codeml-summary.txt"),
fill=T, h=T)
@
<<>>=
# Add both columns
dginnbatsnew$Lucie.s.comments<-""
dginnbatsnew$Action.taken<-""
# Homogenize column names
dginnbats$BUSTED_p.value<-dginnbats$BUSTED.p.value
dginnbats$MEME_NbSites<-dginnbats$MEME.NbSites
dginnbats$MEME_PSS<-dginnbats$MEME.PSS
dginnbats$BppM1M2_p.value<-dginnbats$BppM1M2.p.value
dginnbats$BppM1M2_NbSites<-dginnbats$BppM1M2.NbSites
dginnbats$BppM1M2_PSS<-dginnbats$BppM1M2.PSS
dginnbats$BppM7M8_p.value<-dginnbats$BppM7M8.p.value
dginnbats$BppM7M8_NbSites<-dginnbats$BppM7M8.NbSites
dginnbats$BppM7M8_PSS<-dginnbats$BppM7M8.PSS
dginnbats$codemlM1M2_p.value<-dginnbats$codemlM1M2.p.value
dginnbats$codemlM1M2_NbSites<-dginnbats$codemlM1M2.NbSites
dginnbats$codemlM1M2_PSS<-dginnbats$codemlM1M2.PSS
dginnbats$codemlM7M8_p.value<-dginnbats$codemlM7M8.p.value
dginnbats$codemlM7M8_NbSites<-dginnbats$codemlM7M8.NbSites
dginnbats$codemlM7M8_PSS<-dginnbats$codemlM7M8.PSS
@
<<>>=
# Order columns in the same order in both tables
dginnbats<-dginnbats[,names(dginnbatsnew)]
names(dginnbatsnew) %in% names(dginnbats)
names(dginnbats)==names(dginnbatsnew)
# Put RIPK aside
ripk1<-dginnbatsnew[dginnbatsnew$Gene=="RIPK1",1:27]
# Add it to primate table
names(ripk1)<-names(dginnT)
ripk1$`dginn-primate_omegaM0Bpp`<-as.factor(
ripk1$`dginn-primate_omegaM0Bpp`)
ripk1$`dginn-primate_BUSTED.p.value`<-as.factor(
ripk1$`dginn-primate_BUSTED.p.value`)
ripk1$`dginn-primate_BppM1M2.p.value`<-as.factor(
ripk1$`dginn-primate_BppM1M2.p.value`)
ripk1$`dginn-primate_BppM7M8.p.value`<-as.factor(
ripk1$`dginn-primate_BppM7M8.p.value`)
dginnT<-rbind(dginnT, ripk1)
## Remove it Ripk1 from bats
dginnbatsnew<-dginnbatsnew[dginnbatsnew$Gene!="RIPK1",]
## suppress redundant lines
dginnbats<-dginnbats[(dginnbats$Gene %in% dginnbatsnew$Gene)==FALSE,]
names(dginnbatsnew)<-names(dginnbats)
## replace by new data
dginnbatsnew$omegaM0Bpp<-as.factor(dginnbatsnew$omegaM0Bpp)
dginnbatsnew$BppM1M2_p.value<-as.factor(dginnbatsnew$BppM1M2_p.value)
dginnbatsnew$BppM7M8_p.value<-as.factor(dginnbatsnew$BppM7M8_p.value)
dginnbats<-rbind(dginnbats, dginnbatsnew)
names(dginnbats)<-c("bats_File", "bats_Name", "Gene.name", paste0("bats_",
names(dginnbats)[-(1:3)]))
names(dginnbats)
@
\subsection{Merged table}
<<setup, include=FALSE, cache=FALSE, tidy=TRUE>>=
options(tidy=TRUE, width=70)
@
<<>>=
#tidy.opts = list(width.cutoff = 60)
dim(dginnT)
#dginnT$Gene.name
dim(dginnbats)
#dginnbats$Gene.name
@
Manual corrections:
TMPRSS2 in bats
<<>>=
dginnbats[dginnbats$Gene.name=="TMPRSS2",]
# keeping the uncut one
# renaming the other one TMPRSS2_cut
dginnbats$Gene.name<-as.character(dginnbats$Gene.name)
dginnbats[dginnbats$bats_File==
"TMPRSS2_bat_select_cut_mafft_prank","Gene.name"]<-
"TMPRSS2_cut"
@
RIPK1: ANcestral version kept, suppress it
"RIPK1\_sequences\_filtered\_longestORFs\_mafft\_mincov\_prank"
<<>>=
dginnT<-dginnT[dginnT$File!=
"RIPK1_sequences_filtered_longestORFs_mafft_mincov_prank",]
@
REEP6 eA et B
<<>>=
dginnbats$Gene.name<-as.character(dginnbats$Gene.name)
dginnbats[dginnbats$bats_File==
"REEP6_sequences_filtered_longestORFs_D210gp1_prank", "Gene.name"]<-
"REEP6_old"
dginnbats[dginnbats$bats_File==
"REEP6_LA_bat_select_mafft_prank", "Gene.name"]<-"REEP6"
dginnbats[dginnbats$bats_File==
"REEP6_LB_bat_select_mafft_prank", "Gene.name"]<-"REEP6_like"
@
GNG5
<<>>=
dginnT$Gene.name<-as.character(dginnT$Gene.name)
dginnT[dginnT$File==
"GNG5_sequences_filtered_longestORFs_D189gp2_prank", "Gene.name"]<-
"GNG5_like"
@
<<>>=
dim(dginnbats)
dim(dginnT)
# genes in common
common<-dginnT$Gene.name[dginnT$Gene.name %in% dginnbats$Gene.name]
common
length(dginnT$Gene.name[dginnT$Gene.name %in% dginnbats$Gene.name])
# genes only in primates
onlyprimates<-
dginnT$Gene.name[(dginnT$Gene.name %in% dginnbats$Gene.name)==FALSE]
onlyprimates
length(dginnT$Gene.name[(dginnT$Gene.name %in% dginnbats$Gene.name)==FALSE])
# genes only in bats
onlybats<-
dginnbats$Gene.name[(dginnbats$Gene.name %in% dginnT$Gene.name)==FALSE]
onlybats
length(dginnbats$Gene.name[(dginnbats$Gene.name %in% dginnT$Gene.name)==FALSE])
@
<<>>=
tab<-merge(dginnT, dginnbats, by="Gene.name", all.x=T, all.y=T)
dim(tab)
# add column "shared"/"only bats"/"only primates"
tab$status<-""
tab$status[tab$Gene.name %in% common]<-"shared"
tab$status[tab$Gene.name %in% onlyprimates]<-"onlyprimates"
tab$status[tab$Gene.name %in% onlybats]<-"onlybats"
table(tab$status)
write.table(tab, paste0(
workdir, "out_tab/covid_comp_alldginn.txt"), sep="\t")
@
\section{Complete data}
Merge the previous tab with J Young's original table.
\subsection{Read the original Young table}
<<>>=
young<-read.delim(paste0(workdir,
"data/COVID_PAMLresults_332hits_plusBatScreens_2020_Apr14.csv"),
fill=T, h=T, dec=",")
dim(young)
young$PreyGene<-as.character(young$PreyGene)
young$PreyGene[young$PreyGene=="MTARC1"]<-"MARC1"
@
\subsection{Read the gene names conversion table}
<<>>=
usthem<-read.delim(paste0(workdir,
"/data/table_gene_name_correspondence.csv"),
h=T, sep=";")
young[young$PreyGene %in% usthem$Us, c("PreyGene", "Gene.name")]
usthem[order(usthem$Us),]
@
\subsection{Merge Young and DGINN table}
\textbf{Based on which column?}
How many genes in the Young table are not in the DGINN table. And who are they?
<<>>=
table(young$PreyGene %in% tab$Gene.name)
young[(young$PreyGene %in% tab$Gene.name)==FALSE, "PreyGene"]
tab[(tab$Gene.name %in% young$PreyGene)==FALSE, "Gene.name"]
@
Merge them and keep only the krogan genes
<<>>=
# creation of a dedicated column
young$merge.Gene<-young$PreyGene
tab$merge.Gene<-tab$Gene.name
tablo<-merge(young, tab, by="merge.Gene", all.x=TRUE)
write.table(tablo, paste0(
workdir, "/out_tab/covid_comp_complete.txt"), row.names=FALSE, quote=TRUE, sep="\t")
@
\end{document}
This diff is collapsed.
File added
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment