Commit 78503659 authored by mcariou's avatar mcariou
Browse files

add aln step

parent 33f7ac87
......@@ -185,10 +185,7 @@ Il y a des chemin en dur pour l'appel à taxonomizr + à la base de données.
- Adapt the pipeline to use more than one gene, make concatenates, and format presence/absence for all genes.
=> make script 4_parse_PSIblast.sh (qui coombine PSIBLAST et PSIBLASTTMP)
=> RUNNING
=> problème, ne permet pas de conserver plusieurs souches de pneumophila à l'heure actuelle
=> make concatenate.
......
......@@ -10,6 +10,7 @@ if (length(args)!=8) {
#args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta")
#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.1", "0.1", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta", "~/2021_legio/blastdb/phyloref/phyloref_nuc", "/home/mcariou/2021_legio/Taxo/accessionTaxa.sql")
print("reading arguments and data...")
file<-args[1]
......@@ -101,22 +102,25 @@ system(cmd)
library(ape)
aln<-read.dna(fasta, format="fasta")
aln<-clustal(aln)
alnaln<-clustal(aln)
#names(alnaln)<-names(aln)
names(listid_short)<-gsub(names(listid_short), pattern=" ", replacement="_")
names(aln)<-sapply(names(aln), function(x) strsplit(x, " ")[[1]][1])
tmp<-sapply(rownames(alnaln), function(x) strsplit(x, " ")[[1]][1])
newnames<-sapply(names(aln), function(x){
newnames<-sapply(tmp, function(x){
names(listid_short)[listid_short==x]
})
names(aln)<-newnames
rownames(alnaln)<-newnames
fasta_aln<-gsub(fasta, pattern=".fasta", replacement="_aln.fasta", fixed=TRUE)
write.dna(aln, file=fasta_aln, format="fasta", nbcol = 6, colsep = "", colw = 10)
write.dna(alnaln, file=fasta_aln, format="fasta", nbcol = 6, colsep = "", colw = 10)
......
......@@ -24,22 +24,59 @@ cat<-list()
#### fuction
file<-list.files(paste0(args[1], "/", subrep[1]))
# make a cat list containing all species names
for (subrepcurrent in subrep){
print(subrepcurrent)
file<-list.files(paste0(args[1], "/", subrepcurrent))
file<-file[grep(x=file, pattern="aln")]
aln<-read.dna(paste0(args[1], "/",subrep[1], "/", file), format="fasta", as.character=TRUE)
if (length(file)>0){
aln<-read.dna(paste0(args[1], "/",subrepcurrent, "/", file), format="fasta", as.character=TRUE)
tmp<-aln
rownames(tmp)<-sapply(rownames(aln), function(x) strsplit(x, split=".", fixed=TRUE)[[1]][1])
for (seq in rownames(tmp)){
if((seq %in% names(cat))==FALSE){
#print(seq)
cat<-c(cat, list(seq=""))
names(cat)<-c(names(cat)[-length(cat)], seq)
}
}
}
}
# For a new aln, add sequences to all element of cat (row of gap if necessary)
tmp<-aln
names(tmp)<-sapply(labels(aln), function(x) strsplit(x, split=".", fixed=TRUE)[[1]][1])
add_aln<-function(cat, subrepi){
file<-list.files(paste0(args[1], "/", subrepi))
file<-file[grep(x=file, pattern="aln")]
if (length(file)>0){
aln<-read.dna(paste0(args[1], "/",subrepi, "/", file), format="fasta", as.character=TRUE, as.matrix=FALSE)
names(aln)<-sapply(names(aln), function(x) strsplit(x, split=".", fixed=TRUE)[[1]][1])
len<-max(unlist(lapply(aln, length)))
for (seq in names(cat)){
if(seq %in% names(aln)){
cat[[seq]]<-c(cat[[seq]], aln[[seq]])
}else{
gap=rep("-", len)
cat[[seq]]<-c(cat[[seq]], gap)
}
}
}
return(cat)
}
for (seq in names(tmp)){
print(seq)
cat<-c(cat, list(seq=tmp[seq]))
names(cat)<-c(names(cat)[-length(cat)], seq)
for(i in 1:length(subrep)){
print(i)
cat<-add_aln(cat, subrep[i])
}
#write.dna(aln, file=fasta_aln, format="fasta", nbcol = 6, colsep = "", colw = 10)
write.dna(cat, file=paste0(args[1], "/cat.fasta"), format="fasta", nbcol = 6, colsep = "", colw = 10)
......
......@@ -14,8 +14,8 @@ echo "--------------------------------------------------"
# variable
DATA=$1
FASTA=$1/concat.fasta
PHYLIP=$1/concat.phylip
FASTA=$1/cat.fasta
PHYLIP=$1/cat.phylip
echo $0
rscript=`ls $0 | sed 's/.sh/.R/g'`
......
#!/bin/bash
#$ -S /bin/bash
## name of the job to follow them
#$ -N runscript34
#$ -N runscript5
## name of the queue to be used
#$ -q E5-2670deb*,E5-2667v2*,E5-2667v4*
#$ -cwd
......@@ -10,13 +10,16 @@
## the dirs must exist before job is launched
#$ -o /home/mcariou/2021_legio/log/
#$ -e /home/mcariou/2021_legio/log/
module load trimal
#/home/mcariou/2021_legio/phylolegio/script/3_make_PSIblast.sh ~/2021_legio/blastdb/phyloref/phyloref_prot ~/2021_legio/genes/78Lp_uniprot.fasta ~/2021_legio/out_blastn/
## IN TRIAL
/home/mcariou/2021_legio/phylolegio/script/4_parse_PSIblast.sh ~/2021_legio/out_blastn/78Lp_uniprot.psiblast ~/2021_legio/phylolegio/doc/tabAss.txt ~/2021_legio/fasta/78Lp ~/2021_legio/genes/78Lp_uniprot.fasta 0.0001 0.5 0.5 1
#/home/mcariou/2021_legio/phylolegio/script/4_parse_PSIblast.sh ~/2021_legio/out_blastn/78Lp_uniprot.psiblast ~/2021_legio/phylolegio/doc/tabAss.txt ~/2021_legio/fasta/78Lp ~/2021_legio/genes/78Lp_uniprot.fasta 0.0001 0.5 0.5 1
/home/mcariou/2021_legio/phylolegio/script/5_cat_aln_phy.sh ~/2021_legio/fasta/78Lp/
# fin
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment