\documentclass[11pt, oneside]{article} % use "amsart" instead of "article" for AMSLaTeX format %\usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots. %\geometry{letterpaper} % ... or a4paper or a5paper or ... %\geometry{landscape} % Activate for for rotated page geometry %\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent %\usepackage{graphicx} % Use pdf, png, jpg, or eps with pdflatex; use eps in DVI mode % TeX will automatically convert eps --> pdf in pdflatex %\usepackage{amssymb} \usepackage[utf8]{inputenc} %\usepackage[cyr]{aeguill} %\usepackage[francais]{babel} %\usepackage{hyperref} \title{Positive selection on genes interacting with SARS-Cov2, comparison of different analysis} \author{Marie Cariou} \date{Janvier 2021} % Activate to display a given date or no date \begin{document} \maketitle \tableofcontents \newpage \section{Files manipulations} \subsection{Read Janet Young's table} <<>>= workdir<-"/home/adminmarie/Documents/CIRI_BIBS_projects/2020_05_Etienne_covid/" tab<-read.delim(paste0(workdir, "data/COVID_PAMLresults_332hits_plusBatScreens_2020_Apr14.csv"), fill=T, h=T, dec=",") dim(tab) @ \subsection{Read DGINN Young table} <<>>= dginnY<-read.delim(paste0(workdir, "data/summary_primate_young.res"), fill=T, h=T) dim(dginnY) @ \subsection{Joining Young and DGINN Young table} <<>>= # correct gene names (MARC1) val_remp=as.character(unique(dginnY$Gene)[(unique(dginnY$Gene) %in% tab$Gene.name)==F]) tab$Gene.name<-as.character(tab$Gene.name) tab$Gene.name[158]<-val_remp sum(unique(dginnY$Gene) %in% unique(tab$Gene.name)) @ <<>>= add_col<-function(method="PamlM1M2"){ tmp<-dginnY[dginnY$Method==method, c("Gene", "Omega", "PosSel", "PValue", "NbSites", "PSS")] names(tmp)<-c("Gene.name", paste0("Omega_", method), paste0("PosSel_", method), paste0("PValue_", method), paste0("NbSites_", method), paste0("PSS_", method)) tab<-merge(tab, tmp, by="Gene.name") return(tab) } tab<-add_col("PamlM1M2") tab<-add_col("PamlM7M8") tab<-add_col("BppM1M2") tab<-add_col("BppM7M8") # Manip pour la colonne BUSTED tmp<-dginnY[dginnY$Method=="BUSTED",c("Gene", "Omega", "PosSel", "PValue")] names(tmp)<-c("Gene.name", "Omega_BUSTED", "PosSel_BUSTED", "PValue_BUSTED") tab<-merge(tab, tmp, by="Gene.name") tmp<-dginnY[dginnY$Method=="MEME",c("Gene", "NbSites", "PSS")] names(tmp)<-c("Gene.name", "NbSites_MEME", "PSS_MEME") tab<-merge(tab, tmp, by="Gene.name") @ \subsection{Read DGINN Table} <<>>= dginnT<-read.delim(paste0(workdir, "data/DGINN_202005281649summary_cleaned.csv"), fill=T, h=T, sep=",") dim(dginnT) names(dginnT) # Number of genes in dginn-primate output not present in the original table dginnT[(dginnT$Gene %in% tab$Gene.name)==F,"Gene"] # This includes paralogs, recombinations found by DGINN and additionnal genes # included on purpose # Number of genes from the original list not present in DGINN output tab[(tab$Gene.name %in% dginnT$Gene)==F,"Gene.name"] names(dginnT)<-c("File", "Name", "Gene.name", "GeneSize", "dginn-primate_NbSpecies", "dginn-primate_omegaM0Bpp", "dginn-primate_omegaM0codeml", "dginn-primate_BUSTED", "dginn-primate_BUSTED.p.value", "dginn-primate_MEME.NbSites", "dginn-primate_MEME.PSS", "dginn-primate_BppM1M2", "dginn-primate_BppM1M2.p.value", "dginn-primate_BppM1M2.NbSites", "dginn-primate_BppM1M2.PSS", "dginn-primate_BppM7M8", "dginn-primate_BppM7M8.p.value", "dginn-primate_BppM7M8.NbSites", "dginn-primate_BppM7M8.PSS", "dginn-primate_codemlM1M2", "dginn-primate_codemlM1M2.p.value", "dginn-primate_codemlM1M2.NbSites", "dginn-primate_codemlM1M2.PSS", "dginn-primate_codemlM7M8", "dginn-primate_codemlM7M8.p.value", "dginn-primate_codemlM7M8.NbSites", "dginn-primate_codemlM7M8.PSS") @ <<eval=FALSE>>= table(dginnT$`dginn-primate_BUSTED`) table(dginnT$`dginn-primate_codemlM1M2`) table(dginnT$`dginn-primate_codemlM7M8`) table(dginnT$`dginn-primate_BppM1M2`) table(dginnT$`dginn-primate_BppM7M8`) table(dginnT$`dginn-primate_BUSTED`=="na",dginnT$`dginn-primate_codemlM1M2`=="na", dginnT$`dginn-primate_codemlM7M8`=="na", dginnT$`dginn-primate_BppM1M2`=="na", dginnT$`dginn-primate_BppM7M8`=="na" ) @ \subsection{Join Table and DGINN table} <<>>= tab<-merge(tab,dginnT, by="Gene.name", all.x=T) table(tab$`dginn-primate_BUSTED`) table(tab$`dginn-primate_codemlM1M2`) table(tab$`dginn-primate_codemlM7M8`) table(tab$`dginn-primate_BppM1M2`) table(tab$`dginn-primate_BppM7M8`) table(tab$`dginn-primate_BUSTED`=="na" | tab$`dginn-primate_codemlM1M2`=="na" | tab$`dginn-primate_codemlM7M8`=="na" | tab$`dginn-primate_BppM1M2`=="na"| tab$`dginn-primate_BppM7M8`=="na" ) @ \subsection{Add DGINN results on bat dataset} DGINN results from different analysis. <<>>= # original table dginnbats<-read.delim(paste0(workdir, "data/DGINN_202005281339summary_cleaned.tab"), fill=T, h=T) # rerun on corrected alignment dginnbatsnew1<-read.delim(paste0(workdir, "data/DGINN_202011262248_summary.tab"), fill=T, h=T) dginnbatsnew2<-read.delim(paste0(workdir, "data/DGINN_202012192053_summary.tab"), fill=T, h=T) # colomne choice, BUSTED and Bppml form first file, codeml from the other one dginnbatsnew<-dginnbatsnew1 dginnbatsnew$omegaM0codeml<-dginnbatsnew2$omegaM0codeml dginnbatsnew$codemlM1M2<-dginnbatsnew2$codemlM1M2 dginnbatsnew$codemlM1M2_p.value<-dginnbatsnew2$codemlM1M2_p.value dginnbatsnew$codemlM1M2_NbSites<-dginnbatsnew2$codemlM1M2_NbSites dginnbatsnew$codemlM1M2_PSS<-dginnbatsnew2$codemlM1M2_PSS dginnbatsnew$codemlM7M8<-dginnbatsnew2$codemlM7M8 dginnbatsnew$codemlM7M8_p.value<-dginnbatsnew2$codemlM7M8_p.value dginnbatsnew$codemlM7M8_NbSites<-dginnbatsnew2$codemlM7M8_NbSites dginnbatsnew$codemlM7M8_PSS<-dginnbatsnew2$codemlM7M8_PSS #### ## RIPK1 is actually a primat results ## 1. Take it and put it at the right place ripk1<-as.vector(dginnbatsnew[dginnbatsnew$Gene=="RIPK1",]) tab$`dginn-primate_omegaM0Bpp`<-as.numeric(as.character(tab$`dginn-primate_omegaM0Bpp`)) tab$`dginn-primate_BUSTED.p.value`<-as.numeric(as.character(tab$`dginn-primate_BUSTED.p.value`)) tab$`dginn-primate_BppM1M2.p.value`<-as.numeric(as.character(tab$`dginn-primate_BppM1M2.p.value`)) tab$`dginn-primate_BppM7M8.p.value`<-as.numeric(as.character(tab$`dginn-primate_BppM7M8.p.value`)) tab$`dginn-primate_BppM7M8.PSS`<-as.numeric(as.character(tab$`dginn-primate_BppM7M8.PSS`)) tab$`dginn-primate_codemlM1M2.p.value`<-as.numeric(as.character(tab$`dginn-primate_codemlM1M2.p.value`)) tab$`dginn-primate_codemlM1M2.PSS`<-as.numeric(as.character(tab$`dginn-primate_codemlM1M2.PSS`)) tab$`dginn-primate_codemlM7M8.p.value`<-as.numeric(as.character(tab$`dginn-primate_codemlM7M8.p.value`)) tab$`dginn-primate_codemlM7M8.PSS`<-as.numeric(as.character(tab$`dginn-primate_codemlM7M8.PSS`)) tab[tab$Gene.name=="RIPK1","GeneSize"]<-ripk1$GeneSize tab[tab$Gene.name=="RIPK1","dginn-primate_NbSpecies"]<-ripk1$NbSpecies tab[tab$Gene.name=="RIPK1","dginn-primate_omegaM0Bpp"]<-ripk1$omegaM0Bpp tab[tab$Gene.name=="RIPK1","dginn-primate_omegaM0codeml"]<-ripk1$omegaM0codeml tab[tab$Gene.name=="RIPK1","dginn-primate_BUSTED"]<-ripk1$BUSTED tab[tab$Gene.name=="RIPK1","dginn-primate_BUSTED.p.value"]<-ripk1$BUSTED_p.value tab[tab$Gene.name=="RIPK1","dginn-primate_MEME.NbSites"]<-ripk1$MEME_NbSites tab[tab$Gene.name=="RIPK1","dginn-primate_MEME.PSS"]<-as.numeric(as.character(ripk1$MEME_PSS)) tab[tab$Gene.name=="RIPK1","dginn-primate_BppM1M2"]<-ripk1$BppM1M2 tab[tab$Gene.name=="RIPK1","dginn-primate_BppM1M2.p.value"]<-ripk1$BppM1M2_p.value tab[tab$Gene.name=="RIPK1","dginn-primate_BppM1M2.NbSites"]<-ripk1$BppM1M2_NbSites tab[tab$Gene.name=="RIPK1","dginn-primate_BppM1M2.PSS"]<-ripk1$BppM1M2_PSS tab[tab$Gene.name=="RIPK1","dginn-primate_BppM7M8"]<-ripk1$BppM7M8 tab[tab$Gene.name=="RIPK1","dginn-primate_BppM7M8.p.value"]<-ripk1$BppM7M8_p.value tab[tab$Gene.name=="RIPK1","dginn-primate_BppM7M8.NbSites"]<-ripk1$BppM7M8_NbSites tab[tab$Gene.name=="RIPK1","dginn-primate_BppM7M8.PSS"]<-ripk1$BppM7M8_PSS tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM1M2"]<-ripk1$codemlM1M2 tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM1M2.p.value"]<-ripk1$codemlM1M2_p.value tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM1M2.NbSites"]<-ripk1$codemlM1M2_NbSites tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM1M2.PSS"]<-ripk1$codemlM1M2_PSS tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM7M8"]<-ripk1$codemlM7M8 tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM7M8.p.value"]<-ripk1$codemlM7M8_p.value tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM7M8.NbSites"]<-ripk1$codemlM7M8_NbSites tab[tab$Gene.name=="RIPK1","dginn-primate_codemlM7M8.PSS"]<-ripk1$codemlM7M8_PSS ## 2. Remove it dginnbatsnew<-dginnbatsnew[dginnbatsnew$Gene!="RIPK1",] ## suppress redundant lines dginnbats<-dginnbats[(dginnbats$Gene %in% dginnbatsnew$Gene)==FALSE,] names(dginnbatsnew)<-names(dginnbats) ##############" dginnbatsnew[,4]<-as.numeric(dginnbatsnew[,4]) dginnbats[,6]<-as.numeric(as.character(dginnbats[,6])) dginnbats[,8]<-as.character(dginnbats[,8]) dginnbats[,12]<-as.character(dginnbats[,12]) dginnbats[,13]<-as.numeric(as.character(dginnbats[,13])) dginnbats[,16]<-as.character(dginnbats[,16]) dginnbats[,17]<-as.numeric(as.character(dginnbats[,17])) ## replace by new data dginnbats<-rbind(dginnbats, dginnbatsnew) names(dginnbats)<-c("File", "bats_Name", "cooper.batsGene", paste0("bats_", names(dginnbats)[-(1:3)])) names(dginnbats) tab<-merge(tab,dginnbats, by="cooper.batsGene", all.x=T) @ \subsection{Write the new table} <<>>= write.table(tab, "covid_comp_complete_old.txt", row.names=FALSE, quote=FALSE, sep="\t") @ \section{Second Table} Table containing the DGINN results for both Primates and bats. Conserve all genes. \subsection{Primates} <<>>= dginnT<-read.delim(paste0(workdir, "data/DGINN_202005281649summary_cleaned.csv"), fill=T, h=T, sep=",") dim(dginnT) names(dginnT) # Rename the columns to include primate names(dginnT)<-c("File", "Name", "Gene.name", "GeneSize", "dginn-primate_NbSpecies", "dginn-primate_omegaM0Bpp", "dginn-primate_omegaM0codeml", "dginn-primate_BUSTED", "dginn-primate_BUSTED.p.value", "dginn-primate_MEME.NbSites", "dginn-primate_MEME.PSS", "dginn-primate_BppM1M2", "dginn-primate_BppM1M2.p.value", "dginn-primate_BppM1M2.NbSites", "dginn-primate_BppM1M2.PSS", "dginn-primate_BppM7M8", "dginn-primate_BppM7M8.p.value", "dginn-primate_BppM7M8.NbSites", "dginn-primate_BppM7M8.PSS", "dginn-primate_codemlM1M2", "dginn-primate_codemlM1M2.p.value", "dginn-primate_codemlM1M2.NbSites", "dginn-primate_codemlM1M2.PSS", "dginn-primate_codemlM7M8", "dginn-primate_codemlM7M8.p.value", "dginn-primate_codemlM7M8.NbSites", "dginn-primate_codemlM7M8.PSS") @ \subsection{Bats} <<>>= # original table dginnbats<-read.delim(paste0(workdir, "data/DGINN_202005281339summary_cleaned-LE201108.txt"), fill=T, h=T) # rerun on corrected alignment dginnbatsnew<-read.delim(paste0(workdir, "data/DGINN_202011262248_hyphybpp-202012192053_codeml-summary.txt"), fill=T, h=T) @ <<>>= # Add both columns dginnbatsnew$Lucie.s.comments<-"" dginnbatsnew$Action.taken<-"" # Homogenize column names dginnbats$BUSTED_p.value<-dginnbats$BUSTED.p.value dginnbats$MEME_NbSites<-dginnbats$MEME.NbSites dginnbats$MEME_PSS<-dginnbats$MEME.PSS dginnbats$BppM1M2_p.value<-dginnbats$BppM1M2.p.value dginnbats$BppM1M2_NbSites<-dginnbats$BppM1M2.NbSites dginnbats$BppM1M2_PSS<-dginnbats$BppM1M2.PSS dginnbats$BppM7M8_p.value<-dginnbats$BppM7M8.p.value dginnbats$BppM7M8_NbSites<-dginnbats$BppM7M8.NbSites dginnbats$BppM7M8_PSS<-dginnbats$BppM7M8.PSS dginnbats$codemlM1M2_p.value<-dginnbats$codemlM1M2.p.value dginnbats$codemlM1M2_NbSites<-dginnbats$codemlM1M2.NbSites dginnbats$codemlM1M2_PSS<-dginnbats$codemlM1M2.PSS dginnbats$codemlM7M8_p.value<-dginnbats$codemlM7M8.p.value dginnbats$codemlM7M8_NbSites<-dginnbats$codemlM7M8.NbSites dginnbats$codemlM7M8_PSS<-dginnbats$codemlM7M8.PSS @ <<>>= # Order columns in the same order in both tables dginnbats<-dginnbats[,names(dginnbatsnew)] names(dginnbatsnew) %in% names(dginnbats) names(dginnbats)==names(dginnbatsnew) # Put RIPK aside ripk1<-dginnbatsnew[dginnbatsnew$Gene=="RIPK1",1:27] # Add it to primate table names(ripk1)<-names(dginnT) ripk1$`dginn-primate_omegaM0Bpp`<-as.factor(ripk1$`dginn-primate_omegaM0Bpp`) ripk1$`dginn-primate_BUSTED.p.value`<-as.factor(ripk1$`dginn-primate_BUSTED.p.value`) ripk1$`dginn-primate_BppM1M2.p.value`<-as.factor(ripk1$`dginn-primate_BppM1M2.p.value`) ripk1$`dginn-primate_BppM7M8.p.value`<-as.factor(ripk1$`dginn-primate_BppM7M8.p.value`) dginnT<-rbind(dginnT, ripk1) ## Remove it Ripk1 from bats dginnbatsnew<-dginnbatsnew[dginnbatsnew$Gene!="RIPK1",] ## suppress redundant lines dginnbats<-dginnbats[(dginnbats$Gene %in% dginnbatsnew$Gene)==FALSE,] names(dginnbatsnew)<-names(dginnbats) ## replace by new data dginnbatsnew$omegaM0Bpp<-as.factor(dginnbatsnew$omegaM0Bpp) dginnbatsnew$BppM1M2_p.value<-as.factor(dginnbatsnew$BppM1M2_p.value) dginnbatsnew$BppM7M8_p.value<-as.factor(dginnbatsnew$BppM7M8_p.value) dginnbats<-rbind(dginnbats, dginnbatsnew) names(dginnbats)<-c("bats_File", "bats_Name", "Gene.name", paste0("bats_", names(dginnbats)[-(1:3)])) names(dginnbats) @ \subsection{Merged table} <<setup, include=FALSE, cache=FALSE, tidy=TRUE>>= options(tidy=TRUE, width=70) @ <<>>= #tidy.opts = list(width.cutoff = 60) dim(dginnT) dginnT$Gene.name dim(dginnbats) dginnbats$Gene.name @ Manual corrections: TMPRSS2 in bats <<>>= dginnbats[dginnbats$Gene.name=="TMPRSS2",] # keeping the uncut one # renaming the other one TMPRSS2_cut dginnbats$Gene.name<-as.character(dginnbats$Gene.name) dginnbats[dginnbats$bats_File=="TMPRSS2_bat_select_cut_mafft_prank","Gene.name"]<-"TMPRSS2_cut" @ RIPK1: ANcestral version kept, suppress it "RIPK1\_sequences\_filtered\_longestORFs\_mafft\_mincov\_prank" <<>>= dginnT<-dginnT[dginnT$File!="RIPK1_sequences_filtered_longestORFs_mafft_mincov_prank",] @ REEP6 eA et B <<>>= dginnbats$Gene.name<-as.character(dginnbats$Gene.name) dginnbats[dginnbats$bats_File=="REEP6_sequences_filtered_longestORFs_D210gp1_prank", "Gene.name"]<-"REEP6_old" dginnbats[dginnbats$bats_File=="REEP6_LA_bat_select_mafft_prank", "Gene.name"]<-"REEP6" dginnbats[dginnbats$bats_File=="REEP6_LB_bat_select_mafft_prank", "Gene.name"]<-"REEP6_like" @ GNG5 <<>>= dginnT$Gene.name<-as.character(dginnT$Gene.name) dginnT[dginnT$File=="GNG5_sequences_filtered_longestORFs_D189gp2_prank", "Gene.name"]<-"GNG5_like" @ <<>>= dim(dginnbats) dim(dginnT) # genes in common common<-dginnT$Gene.name[dginnT$Gene.name %in% dginnbats$Gene.name] common length(dginnT$Gene.name[dginnT$Gene.name %in% dginnbats$Gene.name]) # genes only in primates onlyprimates<-dginnT$Gene.name[(dginnT$Gene.name %in% dginnbats$Gene.name)==FALSE] onlyprimates length(dginnT$Gene.name[(dginnT$Gene.name %in% dginnbats$Gene.name)==FALSE]) # genes only in bats onlybats<-dginnbats$Gene.name[(dginnbats$Gene.name %in% dginnT$Gene.name)==FALSE] onlybats length(dginnbats$Gene.name[(dginnbats$Gene.name %in% dginnT$Gene.name)==FALSE]) @ <<>>= tab<-merge(dginnT, dginnbats, by="Gene.name", all.x=T, all.y=T) dim(tab) # add column "shared"/"only bats"/"only primates" tab$status<-"" tab$status[tab$Gene.name %in% common]<-"shared" tab$status[tab$Gene.name %in% onlyprimates]<-"onlyprimates" tab$status[tab$Gene.name %in% onlybats]<-"onlybats" table(tab$status) write.table(tab, "covid_comp_alldginn.txt", sep="\t") @ \section{Complete data} Merge the previous tab with J Young's original table. \textbf{Will replace the 1st part of this script} <<>>= young<-read.delim(paste0(workdir, "data/COVID_PAMLresults_332hits_plusBatScreens_2020_Apr14.csv"), fill=T, h=T, dec=",") dim(young) young$PreyGene<-as.character(young$PreyGene) young$PreyGene[young$PreyGene=="MTARC1"]<-"MARC1" @ How many genes in the Young table are not in the DGINN table. And who are they? <<>>= table(young$PreyGene %in% tab$Gene.name) young[(young$PreyGene %in% tab$Gene.name)==FALSE, "PreyGene"] tab[(tab$Gene.name %in% young$PreyGene)==FALSE, "Gene.name"] @ Merge them and keep only the krogan genes <<>>= tablo<-merge(young, tab, by="Gene.name", all.x=TRUE) write.table(tablo, "covid_comp_complete.txt", row.names=FALSE, quote=TRUE, sep="\t") @ \end{document}