Commit ba09c3de authored by mcariou's avatar mcariou
Browse files

update doc

parent 3b0f558d
28+35+30
230+35+30
32+35+30
setwd("Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/phylolegio/doc/")
list.files(path = paste0(home, "/genes"))
home<-"~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/"
list.files(path = paste0(home, "/genes"))
list.files(path = paste0(home, "/genes/78genes"))
list.files(path = paste0(home, "/genes/78genes"))
list.files(path = paste0(home, "/genes/78genes/prot_penumo78"))
list.files(path = paste0(home, "/genes/78genes/prot_pneumo78"))
library(knitr)
knit2pdf("1_reference_legio_phylo.Rnw")
knit2pdf("1_reference_legio_phylo.Rnw")
......@@ -57,7 +57,7 @@ We chose to use the 78 genes selected by Burstein et al. 2016. The first objecti
\subsection{Data}
Data from the first publication:
\subsubsection{Data from the first publication:}
\url{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5050043/}
\url{https://www.ncbi.nlm.nih.gov/bioproject/PRJNA285910}
......@@ -83,17 +83,21 @@ names(tabPRJNA)<-c("Assembly", "Level", "WGS", "BioSample", "Strain", "Taxonomy"
tabAss<-tabPRJNA[, c("Assembly","Taxonomy")]
tabAss$Origin<-"Burstein"
head(tabAss)
tabAss$Taxonomy
@
We keep only legionella, not \textit{Fluoribacter dumoffii} NY 23, \textit{Fluoribacter gormanii} and \textit{Tatlockia micdadei}.
Data from the seconde publication:
<<>>=
tabAss<-tabAss[grep("Legionella", tabAss$Taxonomy),]
@
\subsubsection{Data from the seconde publication:}
\url{https://pubmed.ncbi.nlm.nih.gov/33881638/}
<<eval=FALSE>>=
<<eval=TRUE>>=
datalist2<-list.files(data, pattern="Gupta")
datalist2
......@@ -102,47 +106,113 @@ gupta1<-read.table(paste0(data, datalist2[1]), sep="\t", fill=TRUE, header=TRUE,
head(gupta1)
gupta2<-read.table(paste0(data, datalist2[2]), sep="\t", fill=TRUE, header=TRUE, comment.char = "")
head(gupta2)
head(tab1)
#gupta2<-read.table(paste0(data, datalist2[2]), sep="\t", fill=TRUE, header=TRUE, comment.char = "")
#head(gupta2)
@
Which species are present in gupta not in the previous datasets.
<<>>=
species<-sub(tab1$Species, pattern = "L. ", replacement = "")
#species<-sub(tab1$Species, pattern = "L. ", replacement = "")
species<-tabAss$Taxonomy
# Species liste in Burstein
species
length(species)
test<-sapply(gupta1$Species, function(x){
is.there<-sum(sapply(species, function(y) grepl(x,pattern = y)))
if (is.there>0){
return(TRUE)
}else{
return(FALSE)
}
})
# In common
gupta1$Species[gupta1$Species %in% species]
# Only in Gupta
gupta1$Species[(gupta1$Species %in% species)==FALSE]
# only in Burstein
species[(species %in% gupta1$Species)==FALSE]
@
Legionella drozanskii LLAP-1 and Legionella shakespearei DSM 23087 are also present in gupta but no strain specified
~
Final list: Remove Legionella drozanskii and Legionella shakespearei and Aquicella, Diplorickettsia, Rickettsiella, Candidatus, Burkholderia
# list of species I need to find assembly for:
list<-gupta1$Species[test==FALSE]
<<>>=
guptasp<-gupta1$Species[(gupta1$Species %in% species)==FALSE]
guptasp<-guptasp[guptasp %in% c("Legionella drozanskii", "Legionella shakespearei")==F]
guptasp[grep(guptasp, pattern="Legionella")]
## Keep only legio and coxiella
list<-c(list[grep(list, pattern="Legionella")], list[grep(list, pattern="Coxiella")])
guptasp<-c(as.character(guptasp[grep(guptasp, pattern="Legionella")]),
as.character(guptasp[grep(guptasp, pattern="Coxiella burnetii")]))
## remove pneumophila.
guptasp<-guptasp[guptasp!="Legionella pneumophila"]
## remove \textit{Fluoribacter dumoffii} NY 23, \textit{Fluoribacter gormanii} and \textit{Tatlockia micdadei}.
guptasp<-guptasp[guptasp!="Legionella dumoffii NY23"]
guptasp<-guptasp[guptasp!="Legionella gormanii"]
guptasp<-guptasp[guptasp!="Legionella micdadei"]
@
\textit{Fluoribacter dumoffii} NY 23, \textit{Fluoribacter gormanii} and \textit{Tatlockia micdadei}.
Manual research of reference genomes using gupta1
<<>>=
guptaonly<-gupta1[gupta1$Species %in% guptasp,]
guptaonly$Assembly<-""
guptaonly[,c(1,5,6)]
guptaonly$Assembly[guptaonly$Species=="Legionella beliardensis"]<-"GCA_900452395.1"
guptaonly$Assembly[guptaonly$Species=="Legionella clemsonensis"]<-"GCA_002240035.1"
guptaonly$Assembly[guptaonly$Species=="Legionella drancourtii"]<-"GCA_000162755.2"
guptaonly$Assembly[guptaonly$Species=="Legionella endosymbiont of Polyplax serrata PsAG"]<-"GCA_002776555.1"
guptaonly$Assembly[guptaonly$Species=="Legionella fairfieldensis"]<-"GCA_000621525.1"
guptaonly$Assembly[guptaonly$Species=="Legionella fallonii"]<-"GCA_000953135.1"
guptaonly$Assembly[guptaonly$Species=="Legionella impletisoli"]<-"GCA_900639875.1"
guptaonly$Assembly[guptaonly$Species=="Legionella longbeachae"]<-"GCA_000091785.1"
guptaonly$Assembly[guptaonly$Species=="Legionella massiliensis"]<-"GCA_000756815.1"
guptaonly$Assembly[guptaonly$Species=="Legionella nagasakiensis"]<-"GCA_900639915.1"
guptaonly$Assembly[guptaonly$Species=="Legionella norrlandica"]<-"GCA_000770585.1"
guptaonly$Assembly[guptaonly$Species=="Legionella rowbothamii"]<-"GCA_900639985.1"
guptaonly$Assembly[guptaonly$Species=="Legionella saoudiensis"]<-"GCA_001465875.1"
guptaonly$Assembly[guptaonly$Species=="Legionella taurinensis"]<-"GCA_003070665.1"
guptaonly$Assembly[guptaonly$Species=="Legionella tunisiensis"]<-"GCA_000308315.1"
guptaonly$Assembly[guptaonly$Species=="Legionella wadsworthii"]<-"GCA_000701265.1"
guptaonly$Assembly[guptaonly$Species=="Legionella yabuuchiae"]<-"GCA_900640115.1"
guptaonly$Assembly[guptaonly$Species=="Coxiella burnetii"]<-"GCA_001572745.1"
@
<<>>=
tabAsstmp<-cbind(guptaonly$Assembly, as.character(guptaonly$Species),
rep("gupta", nrow(guptaonly)))
tabAsstmp<-as.data.frame(tabAsstmp)
names(tabAsstmp)<-names(tabAss)
tabAss<-rbind(tabAss, tabAsstmp)
@
<<eval=FALSE>>=
ncbi<-read.table(paste0(data, "tab_ncbi_file.csv"), sep=";", fill=TRUE, header=TRUE, comment.char = "")
sp<-c("adelaidensis", "birminghamensis", "brunensis", "cherrii")
ncbi2<-read.table(paste0(data, "tab_ncbi_contigs_parsed.csv"), sep=";", fill=TRUE, header=TRUE, comment.char = "")
@
Data from Legionella pneumophila reference strains:
\subsubsection{Data from Legionella pneumophila reference strains:}
\begin{itemize}
\item Legionella pneumophila Paris complete genome : CR628336.1 - GCA\_000048645.1
......@@ -188,14 +258,42 @@ tabAss<-rbind(tabAss, cbind(Taxonomy, Assembly, Origin))
@
\section{Get genes sequences}
\subsection{Get sequences from Burstein et al.}
\section{Get genes sequences}
<<>>=
tab10<-read.table(paste0(data, datalist1[3]), skip=2, sep="\t", fill=TRUE, header=TRUE, comment.char = "")
tab10<-tab10[1:63380,]
tab10$sp<-sapply(as.character(tab10$ORF), function(x) strsplit(x, "_")[[1]][1])
@
I retrieced the sequences from uniprot using the column G (L. pneumophila id) from table sup 3 from Brustein
I thus retrieved 78 proteic sequences. Let see if I can retreive homolgs by BLAST for all individuals.
I will try with PSI-BLAST.
<<>>=
list.files(path = paste0(home, "/genes/78genes/prot_pneumo78"))
@
For the other species, use of PSI BLAST.
\textit{When using PSI-BLAST the results of a normal BLAST search are aligned and used to construct a pattern of conserved residues. This pattern is used for the next round of searching instead of the original query sequence. The process is repeated (iterated) until a final database search finds no more related sequences. When the process ends in this fashion, it is said to have converged.}
\textbf{TO DO:}
\begin{itemize}
\item install PSI BLAST
\item Use PSI-BLAST
\item Find way to filter within DB.
\item Or make a sub db?
\end{itemize}
\subsection{Get sequences from Burstein et al.}
\subsection{Get 78 sequences from Gupta species}
......
This diff is collapsed.
This diff is collapsed.
file species size ncontig comr rocc
942 GCA_900452395.1_50618_B01_genomic Legionella beliardensis 3.5 6 FALSE FALSE
996 GCA_900639825.1_Leg_beliardensis_Wilkinson_1407-AL-H.v1_genomic Legionella beliardensis 3.4 31 FALSE FALSE
271 GCA_002240035.1_ASM224003v1_genomic Legionella clemsonensis 3.2 1 FALSE FALSE
7 GCA_000162755.2_ASM16275v2_genomic Legionella drancourtii 4.1 58 TRUE FALSE
48 GCA_000621525.1_ASM62152v1_genomic Legionella fairfieldensis 2.6 57 FALSE FALSE
1023 GCA_900640125.1_Leg_fairfieldensis_1725-Aus-E.v1_genomic Legionella fairfieldensis 2.6 76 FALSE FALSE
93 GCA_000953135.1_LFA_genomic Legionella fallonii 4.3 3 TRUE FALSE
1001 GCA_900639875.1_Leg_impletisoli_OA1-1.v1_genomic Legionella impletisoli 2.5 20 FALSE FALSE
4 GCA_000091785.1_ASM9178v1_genomic Legionella longbeachae 4.1 2 FALSE TRUE
8 GCA_000176095.1_ASM17609v1_genomic Legionella longbeachae 4 13 FALSE TRUE
267 GCA_002073455.2_ASM207345v2_genomic Legionella longbeachae 4.1 1 FALSE TRUE
270 GCA_002113845.3_ASM211384v3_genomic Legionella longbeachae 4.2 2 FALSE TRUE
439 GCA_004283175.1_ASM428317v1_genomic Legionella longbeachae 4 38 FALSE TRUE
563 GCA_008807315.1_ASM880731v1_genomic Legionella longbeachae 4.3 2 FALSE TRUE
580 GCA_011465255.1_ASM1146525v1_genomic Legionella longbeachae 4.1 3 FALSE TRUE
581 GCA_011465395.1_ASM1146539v1_genomic Legionella longbeachae 4 2 FALSE TRUE
971 GCA_900461575.1_42650_G01_genomic Legionella longbeachae 2.7 2 FALSE FALSE
56 GCA_000756695.1_PRJEB110_assembly_1_genomic Legionella massiliensis 4.3 8 FALSE FALSE
57 GCA_000756815.1_PRJEB6598_assembly_1_genomic Legionella massiliensis 4.3 8 FALSE FALSE
1004 GCA_900639915.1_Leg_nagasakiensis_JCM_15315.v1_genomic Legionella nagasakiensis 2.7 54 FALSE FALSE
58 GCA_000770585.1_ASM77058v1_genomic Legionella norrlandica 3 157 TRUE TRUE
1011 GCA_900639985.1_Leg_rowbothamii_LLAP6.v1_genomic Legionella rowbothamii 3.9 35 FALSE FALSE
307 GCA_003070625.1_ASM307062v1_genomic Legionella taurinensis 3 46 FALSE FALSE
308 GCA_003070645.1_ASM307064v1_genomic Legionella taurinensis 3 46 FALSE FALSE
309 GCA_003070665.1_ASM307066v1_genomic Legionella taurinensis 3 43 FALSE FALSE
310 GCA_003070675.1_ASM307067v1_genomic Legionella taurinensis 3 61 FALSE FALSE
325 GCA_003602125.1_ASM360212v1_genomic Legionella taurinensis 3.2 118 FALSE FALSE
326 GCA_003602175.1_ASM360217v1_genomic Legionella taurinensis 3.3 97 FALSE FALSE
445 GCA_004920375.1_ASM492037v1_genomic Legionella taurinensis 3 89 FALSE FALSE
446 GCA_004920385.1_ASM492038v1_genomic Legionella taurinensis 3 44 FALSE FALSE
448 GCA_004920415.1_ASM492041v1_genomic Legionella taurinensis 3 61 FALSE FALSE
449 GCA_004920475.1_ASM492047v1_genomic Legionella taurinensis 3 46 FALSE FALSE
450 GCA_004920485.1_ASM492048v1_genomic Legionella taurinensis 3 44 FALSE FALSE
451 GCA_004920495.1_ASM492049v1_genomic Legionella taurinensis 3 44 FALSE FALSE
509 GCA_004921725.1_ASM492172v1_genomic Legionella taurinensis 3 46 FALSE FALSE
510 GCA_004921735.1_ASM492173v1_genomic Legionella taurinensis 3 46 FALSE FALSE
963 GCA_900452865.1_50618_H01_genomic Legionella taurinensis 3.1 2 FALSE FALSE
1019 GCA_900640065.1_Leg_taurinensis_Turin_I.v1_genomic Legionella taurinensis 3 41 FALSE FALSE
54 GCA_000701265.1_ASM70126v1_genomic Legionella wadsworthii 3.5 16 FALSE FALSE
964 GCA_900452925.1_42650_E02_genomic Legionella wadsworthii 3.5 2 FALSE FALSE
1017 GCA_900640045.1_Leg_wadsworthii_81-716.v1_genomic Legionella wadsworthii 3.5 24 FALSE FALSE
1022 GCA_900640115.1_Leg_yabuuchiae_OA1-2.v1_genomic Legionella yabuuchiae 2.6 89 FALSE FALSE
#!/bin/bash
echo "USAGE: ./3_make_blast.sh \$1=blast_db \$2=gene_query \$3=out_path"
echo "--------------------------------------------------"
##################################################################################################################
### En local
#./make_blast.sh /home/adminmarie/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/blastdb/cdslegio /home/adminmarie/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/genes/lp0952.fasta /home/adminmarie/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/out_blastn
### PSMN
#./make_blast.sh /Xnfs/ciridb/shared_data/Databases/legio_cds/blastdb_ncbi/cdslegio ~/2021_legio/genes/lp0952.fasta ~/2021_legio/out_blastn
##################################################################################################################
# variable
REP_DB=$1
gene=`echo $2 | sed 's/.fasta//g'`
REP_OUT=$3
gene_name=`echo $gene | awk -F "/" '{print $NF}'`
# Check argument
if [[ -s $gene.fasta ]] ; then
echo "Gene exists:$gene"
else
echo "Missing or wrong gene argument: $2"
exit 1
fi ;
mkdir -p $REP_OUT
if [[ -s $REP_OUT ]] ; then
echo "Out repertory exists: $REP_OUT"
else
echo "Missing or wrong out repertory argument: $3"
exit 1
fi ;
echo "--------------------------------------------------"
echo "make db if not available:"
if [[ -s $REP_DB.nal ]] ; then
echo "$REP_DB.nal exists"
else
echo "$REP_DB.nal doesn't exist"
origdb=`echo x{00..31}`
echo $origdb
blastdb_aliastool -dbtype nucl -dblist "$origdb" -out $REP_DB -title "blastdb with ncbi legionella cds"
fi ;
echo "--------------------------------------------------"
blastn -db $REP_DB -query ${gene}.fasta -evalue 10 \
-outfmt "7 qseqid qlen sseqid slen length pident evalue nident mismatch gapopen qstart qend qseq bitscrore score sstart send sseq sstrand" \
-out $REP_OUT/${gene_name}.blastn -max_target_seqs 5000
echo "program done!"
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment