Commit 321a98f2 authored by mcariou's avatar mcariou
Browse files

modify parse blast

parent c2a64c6c
\relax
\providecommand\hyper@newdestlabel[2]{}
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldcontentsline\contentsline
\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\contentsline\oldcontentsline
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{}
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{2}{section.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.1}Objective}{2}{subsection.1.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.2}Data}{2}{subsection.1.2}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {1.2.1}Data from the first publication:}{2}{subsubsection.1.2.1}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {1.2.2}Data from the seconde publication:}{3}{subsubsection.1.2.2}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {1.2.3}Data from Legionella pneumophila reference strains:}{8}{subsubsection.1.2.3}}
\@writefile{toc}{\contentsline {section}{\numberline {2}Get genes sequences}{9}{section.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Get sequences from Burstein et al.}{9}{subsection.2.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Get 78 sequences from Gupta species}{10}{subsection.2.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Get legionella pneumophila strains sequences}{10}{subsection.2.3}}
\@writefile{toc}{\contentsline {section}{\numberline {3}Phylogeny}{10}{section.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Genes alignement}{10}{subsection.3.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Concatenate}{10}{subsection.3.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Supertree}{10}{subsection.3.3}}
\BOOKMARK [1][-]{section.1}{Introduction}{}% 1
\BOOKMARK [2][-]{subsection.1.1}{Objective}{section.1}% 2
\BOOKMARK [2][-]{subsection.1.2}{Data}{section.1}% 3
\BOOKMARK [3][-]{subsubsection.1.2.1}{Data from the first publication:}{subsection.1.2}% 4
\BOOKMARK [3][-]{subsubsection.1.2.2}{Data from the seconde publication:}{subsection.1.2}% 5
\BOOKMARK [3][-]{subsubsection.1.2.3}{Data from Legionella pneumophila reference strains:}{subsection.1.2}% 6
\BOOKMARK [1][-]{section.2}{Get genes sequences}{}% 7
\BOOKMARK [2][-]{subsection.2.1}{Get sequences from Burstein et al.}{section.2}% 8
\BOOKMARK [2][-]{subsection.2.2}{Get 78 sequences from Gupta species}{section.2}% 9
\BOOKMARK [2][-]{subsection.2.3}{Get legionella pneumophila strains sequences}{section.2}% 10
\BOOKMARK [1][-]{section.3}{Phylogeny}{}% 11
\BOOKMARK [2][-]{subsection.3.1}{Genes alignement}{section.3}% 12
\BOOKMARK [2][-]{subsection.3.2}{Concatenate}{section.3}% 13
\BOOKMARK [2][-]{subsection.3.3}{Supertree}{section.3}% 14
\contentsline {section}{\numberline {1}Introduction}{2}{section.1}
\contentsline {subsection}{\numberline {1.1}Objective}{2}{subsection.1.1}
\contentsline {subsection}{\numberline {1.2}Data}{2}{subsection.1.2}
\contentsline {subsubsection}{\numberline {1.2.1}Data from the first publication:}{2}{subsubsection.1.2.1}
\contentsline {subsubsection}{\numberline {1.2.2}Data from the seconde publication:}{3}{subsubsection.1.2.2}
\contentsline {subsubsection}{\numberline {1.2.3}Data from Legionella pneumophila reference strains:}{8}{subsubsection.1.2.3}
\contentsline {section}{\numberline {2}Get genes sequences}{9}{section.2}
\contentsline {subsection}{\numberline {2.1}Get sequences from Burstein et al.}{9}{subsection.2.1}
\contentsline {subsection}{\numberline {2.2}Get 78 sequences from Gupta species}{10}{subsection.2.2}
\contentsline {subsection}{\numberline {2.3}Get legionella pneumophila strains sequences}{10}{subsection.2.3}
\contentsline {section}{\numberline {3}Phylogeny}{10}{section.3}
\contentsline {subsection}{\numberline {3.1}Genes alignement}{10}{subsection.3.1}
\contentsline {subsection}{\numberline {3.2}Concatenate}{10}{subsection.3.2}
\contentsline {subsection}{\numberline {3.3}Supertree}{10}{subsection.3.3}
This diff is collapsed.
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
# test if there is at least one argument: if not, return an error
if (length(args)!=6) {
stop("5 arguments must be supplied.\n", call.=FALSE)
}
#args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast.mod", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
file<-args[1]
eval=as.numeric(args[2])
percid=as.numeric(args[3])
overlap=as.numeric(args[4])
nind=as.numeric(args[5])
fasta=args[6]
blastn<-read.table(file, h=F, fill=T)
names(blastn)<-c("qseqid", "qlen", "sseqid", "slen", "length",
"pident","evalue","nident","mismatch", "gapopen",
"qstart", "qend", "qseq", "score", "sstart",
"send", "sseq", "sstrand", "species")
### report
print(paste("Number of total hit: ", nrow(blastn)))
print(paste("Number of query:", length(unique(blastn$qseqid))))
print("Species range (row):")
print(table(blastn$species))
### Filters
blastn<-blastn[blastn$evalue<=eval,]
print(paste("Number of hits after evalue filter: ", nrow(blastn)))
blastn<-blastn[blastn$pident>=percid*100,]
print(paste("Number of hits after perc id filter: ", nrow(blastn)))
blastn<-blastn[blastn$nident/blastn$qlen>=overlap,]
print(paste("Number of hits after perc id filter: ", nrow(blastn)))
#blastn<-blastn[blastn$species!="",]
print(paste("Number of hits after unidentified removal: ", nrow(blastn)))
#### Garder 1 séquence par espèce
species<-"Legionella_pneumophila_subsp._pneumophila_str._Philadelphia_1"
selecthit<-function(species){
tmp<-blastn[blastn$specie==species,]
tmp[order(tmp$evalue),][order(tmp$length),]
return(tmp$sseqid[1])
}
listid<-sapply(unique(blastn$species)[unique(blastn$species)!=""], function(x) selecthit(x))
listid2<-sapply(as.character(listid), function(x) substr(x, 1, nchar(x)-2))
for (seq in listid2){
cmd<-paste0("blastdbcmd -db ~/2021_legio/blastdb/phyloref/phyloref_nuc -entry ", seq, " >> ", fasta)
system(cmd)
}
#!/bin/bash
echo "USAGE: ./4_parse_blast.sh \$1=blast_out_file
\$2=modified_blast \$3=fasta_out
echo "USAGE: ./4_parse_PSIblast.sh \$1=blast_out_file
\$2=tabgenome \$3=fasta_out_rep
\$4=seuil_evalue \$5=percid \$6=percoverlap
\$7=maxseqpertax"
echo "--------------------------------------------------"
......@@ -16,7 +16,7 @@ echo "--------------------------------------------------"
#./4_parse_blast.sh ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/out_blastn/lp0952.blastn ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/out_blastn/lp0952_sp.blastn ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/fasta/lp0952_ortho.fasta 0.0001 0.3 0.5 10
### PSMN
#./4_parse_blast.sh ~/2021_legio/out_blastn/78Lp_uniprot.psiblast ~/2021_legio/phylolegio/doc/tabAss.txt ~/2021_legio/fasta/78Lp 0.0001 0.3 0.5 3
#./4_parse_blast.sh ~/2021_legio/out_blastn/78Lp_uniprot.psiblast ~/2021_legio/phylolegio/doc/tabAss.txt ~/2021_legio/fasta/78Lp ~/2021_legio/genes/78Lp_uniprot.fasta 0.0001 0.3 0.5 3
##################################################################################################################
# variable
......@@ -25,47 +25,55 @@ TAB_SP=$2
BLAST_REP=`echo $1 | awk -F "/" '{$NF=""; print $0}' | sed 's/ /\//g'`
FASTA_REP=$3
mkdir -p $FASTA_REP
EVAL=$4
PERCID=$5
PERCOVERLAP=$6
NPERTAX=$7
#################################################################################################################
# Read tabAss, split blast output.
GENES=`grep ">" $4 | sed 's/>//g'`
#for id in `seq $nrow`
#do
#line=`sed -n ${id}p $OUT_BLAST.tmp`
#idncbi=`sed -n ${id}p $OUT_BLAST.tmp | awk '{print $3}' | awk -F[\.] '{print $1}'` #### erreur sur cette ligne, il faut selectionne la ligne $id
#tax=`efetch -db nuccore -id $idncbi -format docsum | xtract -pattern DocumentSummary -element Organism | sed 's/ /_/g'`
#echo $tax
#echo $line " " $tax >> $MOD_BLAST
#done
## This is to get the path to the R script from the sh script
## This assume they are in the same repertory
EVAL=$5
PERCID=$6
PERCOVERLAP=$7
NPERTAX=$8
echo $0
rscript=`ls $0 | sed 's/.sh/.R/g'`
echo $rscript
#Rscript --vanilla $rscript $MOD_BLAST $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA> $BLAST_REP/paste_blast.log
#################################################################################################################
# Read fasta query, to write proper aln.
for Gene in $GENES
do
mkdir -p $FASTA_REP/$Gene
echo $Gene
subblast=$FASTA_REP"/"$Gene"/"$Gene".psiblast"
grep $Gene $OUT_BLAST > $subblast
FASTA=$FASTA_REP"/"$Gene"/"$Gene".fasta"
nrow=`cat $subblast | wc -l`
if [[ -s $subblast.mod ]] ; then
echo "sp names already retrieved"
else
echo "debut de la boucle:"
for id in `seq $nrow`
do
line=`sed -n ${id}p $subblast`
idncbi=`sed -n ${id}p $subblast | awk '{print $3}' | awk -F[\.] '{print $1}'`
tax=`efetch -db nuccore -id $idncbi -format docsum | xtract -pattern DocumentSummary -element Organism | sed 's/ /_/g'`
echo $tax
echo $line " " $tax >> $subblast.mod
done
fi ;
if [[ -s $FASTA ]] ; then
echo "sequences already retrieved"
else
echo "retrieve fasta"
Rscript --vanilla $rscript $subblast.mod $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA> $FASTA_REP/$Gene/paste_blast.log
fi ;
done
......
#!/bin/bash
#$ -S /bin/bash
## name of the job to follow them
#$ -N runscript4
## name of the queue to be used
#$ -q E5-2670deb*,E5-2667v2*,E5-2667v4*
#$ -cwd
#$ -V
## where to put the log files (output and error) automatically generated by the cluster (different from the .log generated by DGINN)
## the dirs must exist before job is launched
#$ -o /home/mcariou/2021_legio/log/
#$ -e /home/mcariou/2021_legio/log/
./4_parse_PSIblast.sh ~/2021_legio/out_blastn/78Lp_uniprot.psiblast ~/2021_legio/phylolegio/doc/tabAss.txt ~/2021_legio/fasta/78Lp ~/2021_legio/genes/78Lp_uniprot.fasta 0.0001 0.3 0.5 1
# fin
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment