Commit 59d4b779 authored by mcariou's avatar mcariou
Browse files

update report 1

parent b21e37c2
......@@ -62,7 +62,7 @@ Data from the first publication:
\url{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5050043/}
\url{https://www.ncbi.nlm.nih.gov/bioproject/PRJNA285910}
<<>>=
home<-"~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/phylolegio/"
home<-"~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/"
data<-paste0(home, "data/")
datalist1<-list.files(data, pattern="NIHMS817051")
......@@ -76,14 +76,19 @@ head(tab1)
@
<<>>=
tabPRJNA<-read.table(paste0(data, "PRJNA285910_AssemblyDetails.txt"), header=FALSE, sep="\t", skip=2, fill=TRUE)
tabPRJNA<-read.table(paste0(data, "PRJNA285910_AssemblyDetails.txt"), header=FALSE, sep="\t", skip=2, fill=TRUE, comment.char = "")
tabPRJNA<-tabPRJNA[,1:6]
names(tabPRJNA)<-c("Assembly", "Level", "WGS", "BioSample", "Strain", "Taxonomy")
@
\url{https://pubmed.ncbi.nlm.nih.gov/33881638/}
<<>>=
<<eval=FALSE>>=
datalist2<-list.files(data, pattern="Gupta")
datalist2
......@@ -100,7 +105,29 @@ head(tab1)
@
Je ne parviens pas à retrouver la correspondance entre les numéros d'accession fournis dans le tableau 1 de la publication et les numéros d'accession d'assemblage GenBank.
Which species are present in Gupta et al. but not in the previous datasets?
<<>>=
# Goal: find the species listed in gupta1 that are absent from tab1, then keep
# only the Legionella and Coxiella entries.

# Species epithets of the first dataset: strip the leading "L. " genus
# abbreviation. The pattern is anchored and the dot escaped so that only a
# literal "L. " prefix is removed (an unescaped "." matches any character).
species<-sub(tab1$Species, pattern = "^L\\. ", replacement = "")
# Epithets actually usable as search terms: surrounding whitespace removed and
# NA/empty entries dropped. An empty search term would match every name and
# wrongly mark all species as already present (tab1 is read with fill=TRUE,
# so blank rows are possible -- TODO confirm against the csv).
known<-trimws(species)
known<-known[!is.na(known) & nzchar(known)]
# For each species name in gupta1: TRUE if it contains one of the known
# epithets. fixed=TRUE makes this a literal substring search, so characters
# that are special in regular expressions cannot alter the match.
# vapply guarantees a logical vector, even when there is nothing to test.
test<-vapply(as.character(gupta1$Species), function(x){
any(vapply(known, function(y) grepl(x, pattern = y, fixed = TRUE), logical(1)))
}, logical(1))
# list of species I need to find assembly for:
list<-gupta1$Species[!test]
## Keep only legio and coxiella (Legionella entries first, then Coxiella)
list<-c(grep("Legionella", list, value = TRUE), grep("Coxiella", list, value = TRUE))
@
<<>>=
ncbi<-read.table(paste0(data, "tab_ncbi_file.csv"), sep=";", fill=TRUE, header=TRUE, comment.char = "")
......
This diff is collapsed.
\documentclass[11pt, oneside]{article}\usepackage[]{graphicx}\usepackage[]{color}
% \maxwidth: a width that never exceeds the current line width.
% It expands to the natural width of the graphic (\Gin@nat@width) when that
% width is not larger than \linewidth, and to \linewidth otherwise, so that
% graphics sized with it do not run into the margin.
% \makeatletter/\makeatother are needed because \Gin@nat@width contains "@".
\makeatletter
\def\maxwidth{ %
\ifdim\Gin@nat@width>\linewidth
\linewidth
\else
\Gin@nat@width
\fi
}
\makeatother
% Colours and macros used to typeset the syntax-highlighted R code inside the
% alltt blocks of this document. Roles below are as seen in the code chunks of
% this file.
% fgcolor: default text colour selected at the start of each knitrout block.
\definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345}
% \hlnum: numbers and TRUE/FALSE constants.
\newcommand{\hlnum}[1]{\textcolor[rgb]{0.686,0.059,0.569}{#1}}%
% \hlstr: string literals.
\newcommand{\hlstr}[1]{\textcolor[rgb]{0.192,0.494,0.8}{#1}}%
% \hlcom: comments (set in italics).
\newcommand{\hlcom}[1]{\textcolor[rgb]{0.678,0.584,0.686}{\textit{#1}}}%
% \hlopt: operators such as $, : and >.
\newcommand{\hlopt}[1]{\textcolor[rgb]{0,0,0}{#1}}%
% \hlstd: ordinary identifiers and punctuation.
\newcommand{\hlstd}[1]{\textcolor[rgb]{0.345,0.345,0.345}{#1}}%
% \hlkwa: language keywords (function, if, else), set in bold.
\newcommand{\hlkwa}[1]{\textcolor[rgb]{0.161,0.373,0.58}{\textbf{#1}}}%
% \hlkwb: the assignment arrow <-.
\newcommand{\hlkwb}[1]{\textcolor[rgb]{0.69,0.353,0.396}{#1}}%
% \hlkwc: argument names in function calls.
\newcommand{\hlkwc}[1]{\textcolor[rgb]{0.333,0.667,0.333}{#1}}%
% \hlkwd: names of called functions, set in bold.
\newcommand{\hlkwd}[1]{\textcolor[rgb]{0.737,0.353,0.396}{\textbf{#1}}}%
% \hlipl is an alias of \hlkwb (not used in the visible chunks).
\let\hlipl\hlkwb
\usepackage{framed}
% kframe: environment that sets its contents on a background of colour
% "shadecolor", built on \MakeFramed/\endMakeFramed and \FrameCommand
% (presumably supplied by the framed package loaded just above -- confirm).
% \at@end@of@kframe holds whatever must be executed when the environment
% closes: nothing by default, or \end{minipage} when a minipage was opened.
% \makeatletter/\makeatother are needed because the macro names contain "@".
\makeatletter
\newenvironment{kframe}{%
\def\at@end@of@kframe{}%
% In inner horizontal mode, wrap the frame in a minipage of width \columnwidth
% and arrange for that minipage to be closed at the end of the environment.
\ifinner\ifhmode%
\def\at@end@of@kframe{\end{minipage}}%
\begin{minipage}{\columnwidth}%
\fi\fi%
% \FrameCommand draws the box: a \colorbox in shadecolor, shifted by
% \@totalleftmargin and -\fboxsep on the left, with compensating skips after it.
\def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep
\colorbox{shadecolor}{##1}\hskip-\fboxsep
% There is no \\@totalrightmargin, so:
\hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}%
% Start the frame: reduce \hsize by \width, reset \@totalleftmargin to zero
% and set \linewidth to the new \hsize.
\MakeFramed {\advance\hsize-\width
\@totalleftmargin\z@ \linewidth\hsize
\@setminipage}}%
% End code: finish the paragraph, close the frame, then run the stored hook.
{\par\unskip\endMakeFramed%
\at@end@of@kframe}
\makeatother
\definecolor{shadecolor}{rgb}{.97, .97, .97}
\definecolor{messagecolor}{rgb}{0, 0, 0}
\definecolor{warningcolor}{rgb}{1, 0, 1}
\definecolor{errorcolor}{rgb}{1, 0, 0}
\newenvironment{knitrout}{}{} % an empty environment to be redefined in TeX
\usepackage{alltt} % use "amsart" instead of "article" for AMSLaTeX format
%\usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots.
%\geometry{letterpaper} % ... or a4paper or a5paper or ...
%\geometry{landscape} % Activate for for rotated page geometry
%\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
%\usepackage{graphicx} % Use pdf, png, jpg, or eps with pdflatex; use eps in DVI mode
% TeX will automatically convert eps --> pdf in pdflatex
%\usepackage{amssymb}
\usepackage[utf8]{inputenc}
%\usepackage[cyr]{aeguill}
%\usepackage[francais]{babel}
%\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{titling}
\usepackage{listings}
\usepackage{upquote}
\usepackage{hyperref}
\usepackage{xcolor}
%\definecolor{gray}{rgb}{0.5,0.5,0.5}
% code: listings environment for LaTeX source, set in typewriter font with
% blue control sequences and gray comments on a pale green background (the
% frame rule uses the same colour as the background).
% Optional argument: extra listings key=value options. They are appended after
% the defaults below, so they can override them,
% e.g. \begin{code}[basicstyle=\ttfamily\small] ... \end{code}.
% The trailing % signs keep line ends from inserting spaces into the start code.
\lstnewenvironment{code}[1][]{%
\lstset{
% upquote=true,
columns=flexible,
basicstyle=\ttfamily,
language=[LaTeX]TeX,
texcsstyle=*\color{blue},
commentstyle=\color{gray},
frame=single,
rulecolor=\color{green!5},
backgroundcolor=\color{green!5},
#1}%
}{}
\title{Reference phylogeny for Legionella}
\author{Marie Cariou}
\date{September 2021} % Activate to display a given date or no date
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\begin{document}
\maketitle
\tableofcontents
\newpage
\section{Introduction}
\subsection{Objective}
We aim to reconstruct a reference phylogeny for all \textit{Legionella} species (those included in Burstein et al. 2016 or Gupta et al. 2020) and 5 selected strains of \textit{Legionella pneumophila}.
We chose to use the 78 genes selected by Burstein et al. 2016. The first objective is then to recover individual gene sequences for all these genomes.
\subsection{Data}
Data from the first publication:
\url{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5050043/}
\url{https://www.ncbi.nlm.nih.gov/bioproject/PRJNA285910}
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{home}\hlkwb{<-}\hlstr{"~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/"}
\hlstd{data}\hlkwb{<-}\hlkwd{paste0}\hlstd{(home,} \hlstr{"data/"}\hlstd{)}
\hlstd{datalist1}\hlkwb{<-}\hlkwd{list.files}\hlstd{(data,} \hlkwc{pattern}\hlstd{=}\hlstr{"NIHMS817051"}\hlstd{)}
\hlstd{datalist1}
\end{alltt}
\begin{verbatim}
## [1] "NIHMS817051-supplement-Supplemental_Table_1.csv" "NIHMS817051-supplement-Supplemental_Table_1.xlsx" "NIHMS817051-supplement-Supplemental_Table_10.csv"
## [4] "NIHMS817051-supplement-Supplemental_Table_10.xlsx" "NIHMS817051-supplement-Supplemental_Table_11.xlsx" "NIHMS817051-supplement-Supplemental_Table_12.xlsx"
## [7] "NIHMS817051-supplement-Supplemental_Table_2.xlsx" "NIHMS817051-supplement-Supplemental_Table_3.csv" "NIHMS817051-supplement-Supplemental_Table_3.xlsx"
## [10] "NIHMS817051-supplement-Supplemental_Table_4.xlsx" "NIHMS817051-supplement-Supplemental_Table_5.xlsx" "NIHMS817051-supplement-Supplemental_Table_6.xlsx"
## [13] "NIHMS817051-supplement-Supplemental_Table_7.xlsx" "NIHMS817051-supplement-Supplemental_Table_8.xlsx" "NIHMS817051-supplement-Supplemental_Table_9.xlsx"
\end{verbatim}
\end{kframe}
\end{knitrout}
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{tab1}\hlkwb{<-}\hlkwd{read.table}\hlstd{(}\hlkwd{paste0}\hlstd{(data, datalist1[}\hlnum{1}\hlstd{]),} \hlkwc{skip}\hlstd{=}\hlnum{2}\hlstd{,} \hlkwc{sep}\hlstd{=}\hlstr{"\textbackslash{}t"}\hlstd{,} \hlkwc{fill}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{header}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{comment.char} \hlstd{=} \hlstr{""}\hlstd{)}
\hlkwd{head}\hlstd{(tab1)}
\end{alltt}
\begin{verbatim}
## Species ATCC.number Strain.designation Isolation ORF.prefix Accession Version BioSample BioProject
## 1 L. adelaidensis ATCC49625 ATCC49625 Water in cooling tower (Adelaide, Australia) Lade LNKA00000000 LNKA01000000 SAMN03842654 PRJNA285910
## 2 L. anisa ATCC35292 WA-316-C3 Tap water (Los Angeles, CA, USA) Lani LNXS00000000  LNXS01000000  SAMN04274764 PRJNA285910
## 3 L. birminghamensis ATCC43702 CDC#1407-AL-14 Lung biopsy (Alabama, USA) Lbir LNXT00000000  LNXT01000000  SAMN04274765 PRJNA285910
## 4 L. bozemanii ATCC33217 WIGA Lung tissue (Key West, FL, USA) Lboz LNXU00000000  LNXU01000000  SAMN04274766 PRJNA285910
## 5 L. brunensis ATCC43878 ATCC43878 Cooling tower water (Czechoslovakia) Lbru LNXV00000000  LNXV01000000  SAMN04274767 PRJNA285910
## 6 L. cherrii ATCC35252 ORW Thermally altered water (Minnesota, USA) Lche LNXW00000000  LNXW01000000  SAMN04274768 PRJNA285910
\end{verbatim}
\end{kframe}
\end{knitrout}
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{tabPRJNA}\hlkwb{<-}\hlkwd{read.table}\hlstd{(}\hlkwd{paste0}\hlstd{(data,} \hlstr{"PRJNA285910_AssemblyDetails.txt"}\hlstd{),} \hlkwc{header}\hlstd{=}\hlnum{FALSE}\hlstd{,} \hlkwc{sep}\hlstd{=}\hlstr{"\textbackslash{}t"}\hlstd{,} \hlkwc{skip}\hlstd{=}\hlnum{2}\hlstd{,} \hlkwc{fill}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{comment.char} \hlstd{=} \hlstr{""}\hlstd{)}
\hlstd{tabPRJNA}\hlkwb{<-}\hlstd{tabPRJNA[,}\hlnum{1}\hlopt{:}\hlnum{6}\hlstd{]}
\hlkwd{names}\hlstd{(tabPRJNA)}\hlkwb{<-}\hlkwd{c}\hlstd{(}\hlstr{"Assembly"}\hlstd{,} \hlstr{"Level"}\hlstd{,} \hlstr{"WGS"}\hlstd{,} \hlstr{"BioSample"}\hlstd{,} \hlstr{"Strain"}\hlstd{,} \hlstr{"Taxonomy"}\hlstd{)}
\end{alltt}
\end{kframe}
\end{knitrout}
\url{https://pubmed.ncbi.nlm.nih.gov/33881638/}
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{datalist2}\hlkwb{<-}\hlkwd{list.files}\hlstd{(data,} \hlkwc{pattern}\hlstd{=}\hlstr{"Gupta"}\hlstd{)}
\hlstd{datalist2}
\hlstd{gupta1}\hlkwb{<-}\hlkwd{read.table}\hlstd{(}\hlkwd{paste0}\hlstd{(data, datalist2[}\hlnum{1}\hlstd{]),} \hlkwc{sep}\hlstd{=}\hlstr{"\textbackslash{}t"}\hlstd{,} \hlkwc{fill}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{header}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{comment.char} \hlstd{=} \hlstr{""}\hlstd{)}
\hlkwd{head}\hlstd{(gupta1)}
\hlstd{gupta2}\hlkwb{<-}\hlkwd{read.table}\hlstd{(}\hlkwd{paste0}\hlstd{(data, datalist2[}\hlnum{2}\hlstd{]),} \hlkwc{sep}\hlstd{=}\hlstr{"\textbackslash{}t"}\hlstd{,} \hlkwc{fill}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{header}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{comment.char} \hlstd{=} \hlstr{""}\hlstd{)}
\hlkwd{head}\hlstd{(gupta2)}
\hlkwd{head}\hlstd{(tab1)}
\end{alltt}
\end{kframe}
\end{knitrout}
Which species are present in Gupta et al. but not in the previous datasets?
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{species}\hlkwb{<-}\hlkwd{sub}\hlstd{(tab1}\hlopt{$}\hlstd{Species,} \hlkwc{pattern} \hlstd{=} \hlstr{"L. "}\hlstd{,} \hlkwc{replacement} \hlstd{=} \hlstr{""}\hlstd{)}
\hlstd{test}\hlkwb{<-}\hlkwd{sapply}\hlstd{(gupta1}\hlopt{$}\hlstd{Species,} \hlkwa{function}\hlstd{(}\hlkwc{x}\hlstd{)\{}
\hlstd{is.there}\hlkwb{<-}\hlkwd{sum}\hlstd{(}\hlkwd{sapply}\hlstd{(species,} \hlkwa{function}\hlstd{(}\hlkwc{y}\hlstd{)} \hlkwd{grepl}\hlstd{(x,}\hlkwc{pattern} \hlstd{= y)))}
\hlkwa{if} \hlstd{(is.there}\hlopt{>}\hlnum{0}\hlstd{)\{}
\hlkwd{return}\hlstd{(}\hlnum{TRUE}\hlstd{)}
\hlstd{\}}\hlkwa{else}\hlstd{\{}
\hlkwd{return}\hlstd{(}\hlnum{FALSE}\hlstd{)}
\hlstd{\}}
\hlstd{\})}
\hlcom{# list of species I need to find assembly for:}
\hlstd{list}\hlkwb{<-}\hlstd{gupta1}\hlopt{$}\hlstd{Species[test}\hlopt{==}\hlnum{FALSE}\hlstd{]}
\hlcom{## Keep only legio and coxiella}
\hlstd{list}\hlkwb{<-}\hlkwd{c}\hlstd{(list[}\hlkwd{grep}\hlstd{(list,} \hlkwc{pattern}\hlstd{=}\hlstr{"Legionella"}\hlstd{)], list[}\hlkwd{grep}\hlstd{(list,} \hlkwc{pattern}\hlstd{=}\hlstr{"Coxiella"}\hlstd{)])}
\end{alltt}
\end{kframe}
\end{knitrout}
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{ncbi}\hlkwb{<-}\hlkwd{read.table}\hlstd{(}\hlkwd{paste0}\hlstd{(data,} \hlstr{"tab_ncbi_file.csv"}\hlstd{),} \hlkwc{sep}\hlstd{=}\hlstr{";"}\hlstd{,} \hlkwc{fill}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{header}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{comment.char} \hlstd{=} \hlstr{""}\hlstd{)}
\hlstd{sp}\hlkwb{<-}\hlkwd{c}\hlstd{(}\hlstr{"adelaidensis"}\hlstd{,} \hlstr{"birminghamensis"}\hlstd{,} \hlstr{"brunensis"}\hlstd{,} \hlstr{"cherrii"}\hlstd{)}
\hlstd{ncbi2}\hlkwb{<-}\hlkwd{read.table}\hlstd{(}\hlkwd{paste0}\hlstd{(data,} \hlstr{"tab_ncbi_contigs_parsed.csv"}\hlstd{),} \hlkwc{sep}\hlstd{=}\hlstr{";"}\hlstd{,} \hlkwc{fill}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{header}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{comment.char} \hlstd{=} \hlstr{""}\hlstd{)}
\end{alltt}
\end{kframe}
\end{knitrout}
\section{Get gene sequences}
\subsection{Get sequences from Burstein et al.}
\subsection{Get 78 sequences from Gupta species}
\subsection{Get Legionella pneumophila strain sequences}
\section{Phylogeny}
\subsection{Gene alignment}
\subsection{Concatenate}
\subsection{Supertree}
\end{document}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment