Commit 33a55205 authored by mcariou's avatar mcariou
Browse files

add phylo step

parent 41005456
This diff is collapsed.
......@@ -17,7 +17,29 @@ Create a phylogenetic pipeline for legionella genes.
sudo apt-get update -y
sudo apt-get install -y efetch
sudo apt install ncbi-entrez-direct
```
-aligner (prank?)
```
sudo apt install prank
```
-trimal
```
git clone https://github.com/scapella/trimal.git
cd trimal/source
make
cp trimal /home/adminmarie/bin/
```
-phyml
```
git clone https://github.com/stephaneguindon/phyml.git
cd phyml
./autogen.sh
./configure --enable-phyml
make
cp src/phyml /home/adminmarie/bin/
```
......@@ -52,11 +74,26 @@ seuil de pvalue? + couverture?
```
~/script/4_parse_blast.sh
~/script/4_parse_blast.sh ~/out_blastn/lp0952.blastn ~/out_blastn/ lp0952_sp.blastn ~/fasta/ lp0952_ortho 0.0001 0.3 0.5 10
USAGE: ./4_parse_blast.sh $1=blast_out$2=modified_blast_path
$3=modified_blast_name
$4=fasta_path $5=fasta_name
$6=seuil_evalue $7=percid $8=percoverlap
$9=maxseqpertax
```
This script take as input a blastn output and filter criteria, get species names using efetch and run a R script to filter the blast output, keep a given number of sequences per unique species names. It return a report od the number of hit blast at each steps and a fasta output.
*problem: I get a roganism name from efetch, wich is not necessarily a standardize species name: numerous names including strain names/number. I keep max 10 sequences for sequences that have the exact same name, but I keep small variations such as "Legionella pneumophila strain zorg" "Legionella_pneumophila_subsp._pneumophila_LPE509" etc. These are difficult to parse because no common feature. To be discussed with team. Results could be improved using the 2 first word of each name*
## 5. Align selected sequences and make phylogeny
```
~/script/5_aln_phy.sh ~/fasta/ lp0952_ortho
```
run prank, trimal and phyml on the fasta output from step 4.
......
......@@ -52,9 +52,16 @@ blastn<-blastn[blastn$nident/blastn$qlen>=overlap,]
print(paste("Number of hits after perc id filter: ", nrow(blastn)))
blastn<-blastn[blastn$species!="",]
print(paste("Number of hits after unidentified removal: ", nrow(blastn)))
## keep best hit for species with multiple hit
print(paste("species with more than" nind, "hits"))
print(paste("species with more than", nind, "hits"))
table(blastn$species)[table(blastn$species)>nind]
......
......@@ -13,7 +13,7 @@ echo "--------------------------------------------------"
#~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/out_blastn/ lp0952_sp.blastn \
#~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/fasta/ lp0952_ortho
#0.0001 0.3 0.5 10
#./4_parse_blast.sh ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/out_blastn/lp0952.blastn ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/out_blastn/lp0952_sp.blastn ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/fasta/ lp0952_ortho 0.0001 0.3 0.5 10
#./4_parse_blast.sh ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/out_blastn/lp0952.blastn ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/out_blastn/ lp0952_sp.blastn ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/fasta/ lp0952_ortho 0.0001 0.3 0.5 10
### PSMN
#./
......@@ -41,22 +41,30 @@ nrow=`wc -l $OUT_BLAST.tmp | awk '{print $1}'`
echo $nrow
echo "debut de la boucle:"
for id in `seq $nrow`
do
line=`sed -n ${id}p $OUT_BLAST.tmp`
idncbi=`sed -n ${id}p $OUT_BLAST.tmp | awk '{print $3}' | awk -F[\.] '{print $1}'` #### erreur sur cette ligne, il faut selectionne la ligne $id
tax=`efetch -db nuccore -id $idncbi -format docsum | xtract -pattern DocumentSummary -element Organism | sed 's/ /_/g'`
echo $tax
echo $line " " $tax >> $MOD_BLAST
done
#for id in `seq $nrow`
#do
#line=`sed -n ${id}p $OUT_BLAST.tmp`
#idncbi=`sed -n ${id}p $OUT_BLAST.tmp | awk '{print $3}' | awk -F[\.] '{print $1}'` #### erreur sur cette ligne, il faut selectionne la ligne $id
#tax=`efetch -db nuccore -id $idncbi -format docsum | xtract -pattern DocumentSummary -element Organism | sed 's/ /_/g'`
#echo $tax
#echo $line " " $tax >> $MOD_BLAST
#done
wc -l $OUT_BLAST
wc -l $MOD_BLAST
rm $OUT_BLAST.tmp
## This is to get the path to the R script from the sh script
## This assume they are in the same repertory
#Rscript --vanilla 4_paste_blast.R $MOD_BLAST $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA> $2/paste_blast.log
echo $0
rscript="."`ls $0 | cut -f 2 -d "."`".R"
echo $rscript
Rscript --vanilla $rscript $MOD_BLAST $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA> $2/paste_blast.log
......
#!/bin/bash
echo "USAGE: ./5_aln_phy.sh \$1=fasta_path \$2=fasta_names(wo extension)"
echo "--------------------------------------------------"
##################################################################################################################
### En local
#./5_aln_phy.sh ~/Documents/CIRI_BIBS_projects/2021_04_Doublet/pipeline/fasta/ lp0952_ortho
### PSMN
#./
##################################################################################################################
# variable
OUT=$1
FASTA=$1/$2.fasta
ALN=$1/$2
PHYLIP=$1/$2_aln.phylip
if [ -e $FASTA ] ; then
echo $FASTA" exists"
#prank -d=$FASTA +F -o=$ALN
echo $ALN
## Convert to phylipX
trimal -in $ALN.best.fas -out $PHYLIP -phylip
## phylogeny
phyml -i $PHYLIP -d nt -m HKY85 -a e -c 4 -s NNI -b -1
else
echo $FASTA "do not exists. incorrect input"
fi ;
# fin
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment