\documentclass[11pt, oneside]{article} % use "amsart" instead of "article" for AMSLaTeX format %\usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots. %\geometry{letterpaper} % ... or a4paper or a5paper or ... %\geometry{landscape} % Activate for for rotated page geometry %\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent %\usepackage{graphicx} % Use pdf, png, jpg, or eps with pdflatex; use eps in DVI mode % TeX will automatically convert eps --> pdf in pdflatex %\usepackage{amssymb} \usepackage[utf8]{inputenc} %\usepackage[cyr]{aeguill} %\usepackage[francais]{babel} %\usepackage{hyperref} \title{Positive selection on genes interacting with SARS-Cov2, comparison of different analysis} \author{Marie Cariou} \date{Janvier 2021} % Activate to display a given date or no date \begin{document} \maketitle \tableofcontents \newpage \section{1st table} Table containing the DGINN results for both Primates and bats. Conserve all genes. \subsection{Primates} <<>>= workdir<-"/home/adminmarie/Documents/CIRI_BIBS_projects/2020_05_Etienne_covid/" dginnT<-read.delim(paste0(workdir, "data/DGINN_202005281649summary_cleaned.csv"), fill=T, h=T, sep=",") dim(dginnT) #names(dginnT) # Rename the columns to include primate names(dginnT)<-c("File", "Name", "Gene.name", "GeneSize", "dginn-primate_NbSpecies", "dginn-primate_omegaM0Bpp", "dginn-primate_omegaM0codeml", "dginn-primate_BUSTED", "dginn-primate_BUSTED.p.value", "dginn-primate_MEME.NbSites", "dginn-primate_MEME.PSS", "dginn-primate_BppM1M2", "dginn-primate_BppM1M2.p.value", "dginn-primate_BppM1M2.NbSites", "dginn-primate_BppM1M2.PSS", "dginn-primate_BppM7M8", "dginn-primate_BppM7M8.p.value", "dginn-primate_BppM7M8.NbSites", "dginn-primate_BppM7M8.PSS", "dginn-primate_codemlM1M2", "dginn-primate_codemlM1M2.p.value", "dginn-primate_codemlM1M2.NbSites", "dginn-primate_codemlM1M2.PSS", "dginn-primate_codemlM7M8", "dginn-primate_codemlM7M8.p.value", "dginn-primate_codemlM7M8.NbSites", "dginn-primate_codemlM7M8.PSS") @ \subsection{Bats} <<>>= # original table dginnbats<-read.delim(paste0(workdir, "data/DGINN_202005281339summary_cleaned-LE201108.txt"), fill=T, h=T) # rerun on corrected alignment dginnbatsnew<-read.delim(paste0(workdir, "data/DGINN_202011262248_hyphybpp-202012192053_codeml-summary.txt"), fill=T, h=T) @ <<>>= # Add both columns dginnbatsnew$Lucie.s.comments<-"" dginnbatsnew$Action.taken<-"" # Homogenize column names dginnbats$BUSTED_p.value<-dginnbats$BUSTED.p.value dginnbats$MEME_NbSites<-dginnbats$MEME.NbSites dginnbats$MEME_PSS<-dginnbats$MEME.PSS dginnbats$BppM1M2_p.value<-dginnbats$BppM1M2.p.value dginnbats$BppM1M2_NbSites<-dginnbats$BppM1M2.NbSites dginnbats$BppM1M2_PSS<-dginnbats$BppM1M2.PSS dginnbats$BppM7M8_p.value<-dginnbats$BppM7M8.p.value dginnbats$BppM7M8_NbSites<-dginnbats$BppM7M8.NbSites dginnbats$BppM7M8_PSS<-dginnbats$BppM7M8.PSS dginnbats$codemlM1M2_p.value<-dginnbats$codemlM1M2.p.value dginnbats$codemlM1M2_NbSites<-dginnbats$codemlM1M2.NbSites dginnbats$codemlM1M2_PSS<-dginnbats$codemlM1M2.PSS dginnbats$codemlM7M8_p.value<-dginnbats$codemlM7M8.p.value dginnbats$codemlM7M8_NbSites<-dginnbats$codemlM7M8.NbSites dginnbats$codemlM7M8_PSS<-dginnbats$codemlM7M8.PSS @ <<>>= # Order columns in the same order in both tables dginnbats<-dginnbats[,names(dginnbatsnew)] names(dginnbatsnew) %in% names(dginnbats) names(dginnbats)==names(dginnbatsnew) # Put RIPK aside ripk1<-dginnbatsnew[dginnbatsnew$Gene=="RIPK1",1:27] # Add it to primate table names(ripk1)<-names(dginnT) ripk1$`dginn-primate_omegaM0Bpp`<-as.factor(ripk1$`dginn-primate_omegaM0Bpp`) ripk1$`dginn-primate_BUSTED.p.value`<-as.factor(ripk1$`dginn-primate_BUSTED.p.value`) ripk1$`dginn-primate_BppM1M2.p.value`<-as.factor(ripk1$`dginn-primate_BppM1M2.p.value`) ripk1$`dginn-primate_BppM7M8.p.value`<-as.factor(ripk1$`dginn-primate_BppM7M8.p.value`) dginnT<-rbind(dginnT, ripk1) ## Remove it Ripk1 from bats dginnbatsnew<-dginnbatsnew[dginnbatsnew$Gene!="RIPK1",] ## suppress redundant lines dginnbats<-dginnbats[(dginnbats$Gene %in% dginnbatsnew$Gene)==FALSE,] names(dginnbatsnew)<-names(dginnbats) ## replace by new data dginnbatsnew$omegaM0Bpp<-as.factor(dginnbatsnew$omegaM0Bpp) dginnbatsnew$BppM1M2_p.value<-as.factor(dginnbatsnew$BppM1M2_p.value) dginnbatsnew$BppM7M8_p.value<-as.factor(dginnbatsnew$BppM7M8_p.value) dginnbats<-rbind(dginnbats, dginnbatsnew) names(dginnbats)<-c("bats_File", "bats_Name", "Gene.name", paste0("bats_", names(dginnbats)[-(1:3)])) names(dginnbats) @ \subsection{Merged table} <<setup, include=FALSE, cache=FALSE, tidy=TRUE>>= options(tidy=TRUE, width=70) @ <<>>= #tidy.opts = list(width.cutoff = 60) dim(dginnT) #dginnT$Gene.name dim(dginnbats) #dginnbats$Gene.name @ Manual corrections: TMPRSS2 in bats <<>>= dginnbats[dginnbats$Gene.name=="TMPRSS2",] # keeping the uncut one # renaming the other one TMPRSS2_cut dginnbats$Gene.name<-as.character(dginnbats$Gene.name) dginnbats[dginnbats$bats_File=="TMPRSS2_bat_select_cut_mafft_prank","Gene.name"]<-"TMPRSS2_cut" @ RIPK1: ANcestral version kept, suppress it "RIPK1\_sequences\_filtered\_longestORFs\_mafft\_mincov\_prank" <<>>= dginnT<-dginnT[dginnT$File!="RIPK1_sequences_filtered_longestORFs_mafft_mincov_prank",] @ REEP6 eA et B <<>>= dginnbats$Gene.name<-as.character(dginnbats$Gene.name) dginnbats[dginnbats$bats_File=="REEP6_sequences_filtered_longestORFs_D210gp1_prank", "Gene.name"]<-"REEP6_old" dginnbats[dginnbats$bats_File=="REEP6_LA_bat_select_mafft_prank", "Gene.name"]<-"REEP6" dginnbats[dginnbats$bats_File=="REEP6_LB_bat_select_mafft_prank", "Gene.name"]<-"REEP6_like" @ GNG5 <<>>= dginnT$Gene.name<-as.character(dginnT$Gene.name) dginnT[dginnT$File=="GNG5_sequences_filtered_longestORFs_D189gp2_prank", "Gene.name"]<-"GNG5_like" @ <<>>= dim(dginnbats) dim(dginnT) # genes in common common<-dginnT$Gene.name[dginnT$Gene.name %in% dginnbats$Gene.name] common length(dginnT$Gene.name[dginnT$Gene.name %in% dginnbats$Gene.name]) # genes only in primates onlyprimates<-dginnT$Gene.name[(dginnT$Gene.name %in% dginnbats$Gene.name)==FALSE] onlyprimates length(dginnT$Gene.name[(dginnT$Gene.name %in% dginnbats$Gene.name)==FALSE]) # genes only in bats onlybats<-dginnbats$Gene.name[(dginnbats$Gene.name %in% dginnT$Gene.name)==FALSE] onlybats length(dginnbats$Gene.name[(dginnbats$Gene.name %in% dginnT$Gene.name)==FALSE]) @ <<>>= tab<-merge(dginnT, dginnbats, by="Gene.name", all.x=T, all.y=T) dim(tab) # add column "shared"/"only bats"/"only primates" tab$status<-"" tab$status[tab$Gene.name %in% common]<-"shared" tab$status[tab$Gene.name %in% onlyprimates]<-"onlyprimates" tab$status[tab$Gene.name %in% onlybats]<-"onlybats" table(tab$status) write.table(tab, "covid_comp_alldginn.txt", sep="\t") @ \section{Complete data} Merge the previous tab with J Young's original table. \subsection{Read the original Young table} <<>>= young<-read.delim(paste0(workdir, "data/COVID_PAMLresults_332hits_plusBatScreens_2020_Apr14.csv"), fill=T, h=T, dec=",") dim(young) young$PreyGene<-as.character(young$PreyGene) young$PreyGene[young$PreyGene=="MTARC1"]<-"MARC1" @ \subsection{Read the gene names conversion table} <<>>= usthem<-read.delim(paste0(workdir, "/data/table_gene_name_correspondence.csv"), h=T, sep=";") young[young$PreyGene %in% usthem$Us, c("PreyGene", "Gene.name")] usthem[order(usthem$Us),] @ \subsection{Merge Young and DGINN table} \textbf{Based on which column?} How many genes in the Young table are not in the DGINN table. And who are they? <<>>= table(young$PreyGene %in% tab$Gene.name) young[(young$PreyGene %in% tab$Gene.name)==FALSE, "PreyGene"] tab[(tab$Gene.name %in% young$PreyGene)==FALSE, "Gene.name"] @ Merge them and keep only the krogan genes <<>>= # creation of a dedicated column young$merge.Gene<-young$PreyGene tab$merge.Gene<-tab$Gene.name tablo<-merge(young, tab, by="merge.Gene", all.x=TRUE) write.table(tablo, "covid_comp_complete.txt", row.names=FALSE, quote=TRUE, sep="\t") @ \end{document}