diff --git a/README.md b/README.md index 178b7f48231096898660952086dff1465ab21d55..5e6a6eca82a29e94e34327dd6699b9f8acb8f819 100755 --- a/README.md +++ b/README.md @@ -176,6 +176,17 @@ run prank, trimal and phyml on the fasta output from step 4. ## 6. To do +16/11/2021 +Les script 4 mache sur lpg2300 et récupère 49 séquences + +--> checker quelles espèces sont présentes et cohérence avec résultats de Margaux + quels sont les espèces manquantes + +--> Faire en sorte que ce script marche aussi sur plusieurs gènes + + + + + - Adapt the pipeline to use more than one gene, make concatenates, and format presence/absence for all genes. => make script 3_run_PSIblast.sh to blast all 78 genes retrieved from uniprot on the specific database designed for phylogeny. diff --git a/script/.4_parse_PSIblastTMP.R.swp b/script/.4_parse_PSIblastTMP.sh.swp similarity index 57% rename from script/.4_parse_PSIblastTMP.R.swp rename to script/.4_parse_PSIblastTMP.sh.swp index 2deb9f68dbe6f03a5b268b6bf6e37a48c7d306a2..ec2f9478529362620ec5b338ea323d8923ffbeea 100644 Binary files a/script/.4_parse_PSIblastTMP.R.swp and b/script/.4_parse_PSIblastTMP.sh.swp differ diff --git a/script/4_parse_PSIblastTMP.R b/script/4_parse_PSIblastTMP.R index 38af144dc65daca4453b8c32d669f21f64ea8029..68c90c714e29cf95a578a7d11ad86b1de90a60a1 100644 --- a/script/4_parse_PSIblastTMP.R +++ b/script/4_parse_PSIblastTMP.R @@ -8,7 +8,7 @@ if (length(args)!=6) { #args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast.mod", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta") #args<-c("/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/78Lp/Q5ZXP3_lpg0688/Q5ZXP3_lpg0688.fasta") -#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.3", "0.5", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta") +#args<-c("/home/mcariou/2021_legio/out_blastn/lpg2300.psiblast", "0.0001", "0.1", "0.1", "1", "/home/mcariou/2021_legio/fasta/lpg2300/lpg2300.fasta") print("reading arguments and data...") @@ -84,17 +84,34 @@ blastn$sseqid<-unlist(blastn$sseqid) blastn<-blastn[blastn$sseqid %in% listid,] print(paste("Keep best hit for each species, nrow = ", nrow(blastn))) +listid_short<-sapply(listid, function(x) substr(x, 1, nchar(x)-2)) #### Retrieve sequences via blastcmd -for (seq in listid2){ +system(paste0("rm ", fasta)) +for (seq in listid_short){ cmd<-paste0("blastdbcmd -db ~/2021_legio/blastdb/phyloref/phyloref_nuc -entry ", seq, " >> ", fasta) system(cmd) } + #### Change names in the fasta +library(ape) + +aln<-read.dna(fasta, format="fasta") + +names(listid_short)<-gsub(names(listid_short), pattern=" ", replacement="_") + +names(aln)<-sapply(names(aln), function(x) strsplit(x, " ")[[1]][1]) + +newnames<-sapply(names(aln), function(x){ + names(listid_short)[listid_short==x] +}) +names(aln)<-newnames +fasta_aln<-gsub(fasta, pattern=".fasta", replacement="_aln.fasta", fixed=TRUE) +write.dna(aln, file=fasta_aln, format="fasta", nbcol = 6, colsep = "", colw = 10) diff --git a/script/4_parse_PSIblastTMP.sh b/script/4_parse_PSIblastTMP.sh index 05ee772c9b240ac491168dd4b4410445658abe65..b1367785a41795326a10951d4f5ab54840369656 100755 --- a/script/4_parse_PSIblastTMP.sh +++ b/script/4_parse_PSIblastTMP.sh @@ -30,7 +30,7 @@ FASTA_REP=$3 mkdir -p $FASTA_REP #GENES=`grep ">" $4 | sed 's/>//g'` -Genes="lpg2300" +Gene="lpg2300" EVAL=$5 @@ -55,6 +55,7 @@ echo `wc -l $subblast` if [[ -s $FASTA ]] ; then echo "sequences already retrieved" +echo $FASTA else echo "retrieve fasta" Rscript --vanilla $rscript $OUT_BLAST $EVAL $PERCID $PERCOVERLAP $NPERTAX $FASTA > $FASTA_REP/paste_blast_test.log diff --git a/script/runscript.sh b/script/runscript.sh index 4b4ec8f392c532f2eaab50b2e426c62294c119b1..8f721b025c1fe4af4e0f2272627259a86285d4fd 100755 --- a/script/runscript.sh +++ b/script/runscript.sh @@ -10,6 +10,8 @@ ## the dirs must exist before job is launched #$ -o /home/mcariou/2021_legio/log/ #$ -e /home/mcariou/2021_legio/log/ +module load R + ./4_parse_PSIblastTMP.sh ~/2021_legio/out_blastn/lpg2300.psiblast ~/2021_legio/phylolegio/doc/tabAss.txt ~/2021_legio/fasta/lpg2300 ~/2021_legio/genes/lpg2300.fasta 0.0001 0.1 0.1 1 diff --git "a/ta_aln<-gsub(fasta, pattern=\".fasta\", replacement=\"_aln.fasta\", fixed=TRUE)" "b/ta_aln<-gsub(fasta, pattern=\".fasta\", replacement=\"_aln.fasta\", fixed=TRUE)" new file mode 100644 index 0000000000000000000000000000000000000000..38a0b9ea13410af82f3b0f07da56e3452c7c0621 --- /dev/null +++ "b/ta_aln<-gsub(fasta, pattern=\".fasta\", replacement=\"_aln.fasta\", fixed=TRUE)" @@ -0,0 +1,413 @@ +grep package:base R Documentation + +_P_a_t_t_e_r_n _M_a_t_c_h_i_n_g _a_n_d _R_e_p_l_a_c_e_m_e_n_t + +_D_e_s_c_r_i_p_t_i_o_n: + + ‘grep’, ‘grepl’, ‘regexpr’, ‘gregexpr’, ‘regexec’ and ‘gregexec’ + search for matches to argument ‘pattern’ within each element of a + character vector: they differ in the format of and amount of + detail in the results. + + ‘sub’ and ‘gsub’ perform replacement of the first and all matches + respectively. + +_U_s_a_g_e: + + grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE, + fixed = FALSE, useBytes = FALSE, invert = FALSE) + + grepl(pattern, x, ignore.case = FALSE, perl = FALSE, + fixed = FALSE, useBytes = FALSE) + + sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, + fixed = FALSE, useBytes = FALSE) + + gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, + fixed = FALSE, useBytes = FALSE) + + regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, + fixed = FALSE, useBytes = FALSE) + + gregexpr(pattern, text, ignore.case = FALSE, perl = FALSE, + fixed = FALSE, useBytes = FALSE) + + regexec(pattern, text, ignore.case = FALSE, perl = FALSE, + fixed = FALSE, useBytes = FALSE) + + gregexec(pattern, text, ignore.case = FALSE, perl = FALSE, + fixed = FALSE, useBytes = FALSE) + +_A_r_g_u_m_e_n_t_s: + + pattern: character string containing a regular expression (or + character string for ‘fixed = TRUE’) to be matched in the + given character vector. Coerced by ‘as.character’ to a + character string if possible. If a character vector of + length 2 or more is supplied, the first element is used with + a warning. Missing values are allowed except for ‘regexpr’, + ‘gregexpr’ and ‘regexec’. + + x, text: a character vector where matches are sought, or an object + which can be coerced by ‘as.character’ to a character vector. + Long vectors are supported. + +ignore.case: if ‘FALSE’, the pattern matching is _case sensitive_ and + if ‘TRUE’, case is ignored during matching. + + perl: logical. Should Perl-compatible regexps be used? + + value: if ‘FALSE’, a vector containing the (‘integer’) indices of + the matches determined by ‘grep’ is returned, and if ‘TRUE’, + a vector containing the matching elements themselves is + returned. + + fixed: logical. If ‘TRUE’, ‘pattern’ is a string to be matched as + is. Overrides all conflicting arguments. + +useBytes: logical. If ‘TRUE’ the matching is done byte-by-byte rather + than character-by-character. See ‘Details’. + + invert: logical. If ‘TRUE’ return indices or values for elements + that do _not_ match. + +replacement: a replacement for matched pattern in ‘sub’ and ‘gsub’. + Coerced to character if possible. For ‘fixed = FALSE’ this + can include backreferences ‘"\1"’ to ‘"\9"’ to parenthesized + subexpressions of ‘pattern’. For ‘perl = TRUE’ only, it can + also contain ‘"\U"’ or ‘"\L"’ to convert the rest of the + replacement to upper or lower case and ‘"\E"’ to end case + conversion. If a character vector of length 2 or more is + supplied, the first element is used with a warning. If ‘NA’, + all elements in the result corresponding to matches will be + set to ‘NA’. + +_D_e_t_a_i_l_s: + + Arguments which should be character strings or character vectors + are coerced to character if possible. + + Each of these functions operates in one of three modes: + + 1. ‘fixed = TRUE’: use exact matching. + + 2. ‘perl = TRUE’: use Perl-style regular expressions. + + 3. ‘fixed = FALSE, perl = FALSE’: use POSIX 1003.2 extended + regular expressions (the default). + + See the help pages on regular expression for details of the + different types of regular expressions. + + The two ‘*sub’ functions differ only in that ‘sub’ replaces only + the first occurrence of a ‘pattern’ whereas ‘gsub’ replaces all + occurrences. If ‘replacement’ contains backreferences which are + not defined in ‘pattern’ the result is undefined (but most often + the backreference is taken to be ‘""’). + + For ‘regexpr’, ‘gregexpr’, ‘regexec’ and ‘gregexec’ it is an error + for ‘pattern’ to be ‘NA’, otherwise ‘NA’ is permitted and gives an + ‘NA’ match. + + Both ‘grep’ and ‘grepl’ take missing values in ‘x’ as not matching + a non-missing ‘pattern’. + + The main effect of ‘useBytes = TRUE’ is to avoid errors/warnings + about invalid inputs and spurious matches in multibyte locales, + but for ‘regexpr’ it changes the interpretation of the output. It + inhibits the conversion of inputs with marked encodings, and is + forced if any input is found which is marked as ‘"bytes"’ (see + ‘Encoding’). + + Caseless matching does not make much sense for bytes in a + multibyte locale, and you should expect it only to work for ASCII + characters if ‘useBytes = TRUE’. + + ‘regexpr’ and ‘gregexpr’ with ‘perl = TRUE’ allow Python-style + named captures, but not for _long vector_ inputs. + + Invalid inputs in the current locale are warned about up to 5 + times. + + Caseless matching with ‘perl = TRUE’ for non-ASCII characters + depends on the PCRE library being compiled with ‘Unicode property + support’, which PCRE2 is by default. + +_V_a_l_u_e: + + ‘grep(value = FALSE)’ returns a vector of the indices of the + elements of ‘x’ that yielded a match (or not, for ‘invert = + TRUE’). This will be an integer vector unless the input is a + _long vector_, when it will be a double vector. + + ‘grep(value = TRUE)’ returns a character vector containing the + selected elements of ‘x’ (after coercion, preserving names but no + other attributes). + + ‘grepl’ returns a logical vector (match or not for each element of + ‘x’). + + ‘sub’ and ‘gsub’ return a character vector of the same length and + with the same attributes as ‘x’ (after possible coercion to + character). Elements of character vectors ‘x’ which are not + substituted will be returned unchanged (including any declared + encoding). If ‘useBytes = FALSE’ a non-ASCII substituted result + will often be in UTF-8 with a marked encoding (e.g., if there is a + UTF-8 input, and in a multibyte locale unless ‘fixed = TRUE’). + Such strings can be re-encoded by ‘enc2native’. + + ‘regexpr’ returns an integer vector of the same length as ‘text’ + giving the starting position of the first match or -1 if there is + none, with attribute ‘"match.length"’, an integer vector giving + the length of the matched text (or -1 for no match). The match + positions and lengths are in characters unless ‘useBytes = TRUE’ + is used, when they are in bytes (as they are for ASCII-only + matching: in either case an attribute ‘useBytes’ with value ‘TRUE’ + is set on the result). If named capture is used there are further + attributes ‘"capture.start"’, ‘"capture.length"’ and + ‘"capture.names"’. + + ‘gregexpr’ returns a list of the same length as ‘text’ each + element of which is of the same form as the return value for + ‘regexpr’, except that the starting positions of every (disjoint) + match are given. + + ‘regexec’ returns a list of the same length as ‘text’ each element + of which is either -1 if there is no match, or a sequence of + integers with the starting positions of the match and all + substrings corresponding to parenthesized subexpressions of + ‘pattern’, with attribute ‘"match.length"’ a vector giving the + lengths of the matches (or -1 for no match). The interpretation + of positions and length and the attributes follows ‘regexpr’. + + ‘gregexec’ returns the same as ‘regexec’, except that to + accommodate multiple matches per element of ‘text’, the integer + sequences for each match are made into columns of a matrix, with + one matrix per element of ‘text’ with matches. + + Where matching failed because of resource limits (especially for + ‘perl = TRUE’) this is regarded as a non-match, usually with a + warning. + +_W_a_r_n_i_n_g: + + The POSIX 1003.2 mode of ‘gsub’ and ‘gregexpr’ does not work + correctly with repeated word-boundaries (e.g., ‘pattern = "\b"’). + Use ‘perl = TRUE’ for such matches (but that may not work as + expected with non-ASCII inputs, as the meaning of ‘word’ is + system-dependent). + +_P_e_r_f_o_r_m_a_n_c_e _c_o_n_s_i_d_e_r_a_t_i_o_n_s: + + If you are doing a lot of regular expression matching, including + on very long strings, you will want to consider the options used. + Generally ‘perl = TRUE’ will be faster than the default regular + expression engine, and ‘fixed = TRUE’ faster still (especially + when each pattern is matched only a few times). + + If you are working in a single-byte locale and have marked UTF-8 + strings that are representable in that locale, convert them first + as just one UTF-8 string will force all the matching to be done in + Unicode, which attracts a penalty of around 3x for the default + POSIX 1003.2 mode. + + If you can make use of ‘useBytes = TRUE’, the strings will not be + checked before matching, and the actual matching will be faster. + Often byte-based matching suffices in a UTF-8 locale since byte + patterns of one character never match part of another. Character + ranges may produce unexpected results. + + PCRE-based matching by default used to put additional effort into + ‘studying’ the compiled pattern when ‘x’/‘text’ has length 10 or + more. That study may use the PCRE JIT compiler on platforms where + it is available (see ‘pcre_config’). As from PCRE2 (PCRE version + >= 10.00 as reported by ‘extSoftVersion’), there is no study + phase, but the patterns are optimized automatically when possible, + and PCRE JIT is used when enabled. The details are controlled by + ‘options’ ‘PCRE_study’ and ‘PCRE_use_JIT’. (Some timing + comparisons can be seen by running file ‘tests/PCRE.R’ in the R + sources (and perhaps installed).) People working with PCRE and + very long strings can adjust the maximum size of the JIT stack by + setting environment variable ‘R_PCRE_JIT_STACK_MAXSIZE’ before JIT + is used to a value between ‘1’ and ‘1000’ in MB: the default is + ‘64’. When JIT is not used with PCRE version < 10.30 (that is with + PCRE1 and old versions of PCRE2), it might also be wise to set the + option ‘PCRE_limit_recursion’. + +_N_o_t_e: + + Aspects will be platform-dependent as well as local-dependent: for + example the implementation of character classes (except + ‘[:digit:]’ and ‘[:xdigit:]’). One can expect results to be + consistent for ASCII inputs and when working in UTF-8 mode (when + most platforms will use Unicode character tables, although those + are updated frequently and subject to some degree of + interpretation - is a circled capital letter alphabetic or a + symbol?). However, results in 8-bit encodings can differ + considerably between platforms, modes and from the UTF-8 versions. + +_S_o_u_r_c_e: + + The C code for POSIX-style regular expression matching has changed + over the years. As from R 2.10.0 (Oct 2009) the TRE library of + Ville Laurikari (<URL: https://github.com/laurikari/tre>) is used. + The POSIX standard does give some room for interpretation, + especially in the handling of invalid regular expressions and the + collation of character ranges, so the results will have changed + slightly over the years. + + For Perl-style matching PCRE2 or PCRE (<URL: + https://www.pcre.org>) is used: again the results may depend + (slightly) on the version of PCRE in use. + +_R_e_f_e_r_e_n_c_e_s: + + Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S + Language_. Wadsworth & Brooks/Cole (‘grep’) + +_S_e_e _A_l_s_o: + + regular expression (aka ‘regexp’) for the details of the pattern + specification. + + ‘regmatches’ for extracting matched substrings based on the + results of ‘regexpr’, ‘gregexpr’ and ‘regexec’. + + ‘glob2rx’ to turn wildcard matches into regular expressions. + + ‘agrep’ for approximate matching. + + ‘charmatch’, ‘pmatch’ for partial matching, ‘match’ for matching + to whole strings, ‘startsWith’ for matching of initial parts of + strings. + + ‘tolower’, ‘toupper’ and ‘chartr’ for character translations. + + ‘apropos’ uses regexps and has more examples. + + ‘grepRaw’ for matching raw vectors. + + Options ‘PCRE_limit_recursion’, ‘PCRE_study’ and ‘PCRE_use_JIT’. + + ‘extSoftVersion’ for the versions of regex and PCRE libraries in + use, ‘pcre_config’ for more details for PCRE. + +_E_x_a_m_p_l_e_s: + + grep("[a-z]", letters) + + txt <- c("arm","foot","lefroo", "bafoobar") + if(length(i <- grep("foo", txt))) + cat("'foo' appears at least once in\n\t", txt, "\n") + i # 2 and 4 + txt[i] + + ## Double all 'a' or 'b's; "\" must be escaped, i.e., 'doubled' + gsub("([ab])", "\\1_\\1_", "abc and ABC") + + txt <- c("The", "licenses", "for", "most", "software", "are", + "designed", "to", "take", "away", "your", "freedom", + "to", "share", "and", "change", "it.", + "", "By", "contrast,", "the", "GNU", "General", "Public", "License", + "is", "intended", "to", "guarantee", "your", "freedom", "to", + "share", "and", "change", "free", "software", "--", + "to", "make", "sure", "the", "software", "is", + "free", "for", "all", "its", "users") + ( i <- grep("[gu]", txt) ) # indices + stopifnot( txt[i] == grep("[gu]", txt, value = TRUE) ) + + ## Note that for some implementations character ranges are + ## locale-dependent (but not currently). Then [b-e] in locales such as + ## en_US may include B as the collation order is aAbBcCdDe ... + (ot <- sub("[b-e]",".", txt)) + txt[ot != gsub("[b-e]",".", txt)]#- gsub does "global" substitution + ## In caseless matching, ranges include both cases: + a <- grep("[b-e]", txt, value = TRUE) + b <- grep("[b-e]", txt, ignore.case = TRUE, value = TRUE) + setdiff(b, a) + + txt[gsub("g","#", txt) != + gsub("g","#", txt, ignore.case = TRUE)] # the "G" words + + regexpr("en", txt) + + gregexpr("e", txt) + + ## Using grepl() for filtering + ## Find functions with argument names matching "warn": + findArgs <- function(env, pattern) { + nms <- ls(envir = as.environment(env)) + nms <- nms[is.na(match(nms, c("F","T")))] # <-- work around "checking hack" + aa <- sapply(nms, function(.) { o <- get(.) + if(is.function(o)) names(formals(o)) }) + iw <- sapply(aa, function(a) any(grepl(pattern, a, ignore.case=TRUE))) + aa[iw] + } + findArgs("package:base", "warn") + + ## trim trailing white space + str <- "Now is the time " + sub(" +$", "", str) ## spaces only + ## what is considered 'white space' depends on the locale. + sub("[[:space:]]+$", "", str) ## white space, POSIX-style + ## what PCRE considered white space changed in version 8.34: see ?regex + sub("\\s+$", "", str, perl = TRUE) ## PCRE-style white space + + ## capitalizing + txt <- "a test of capitalizing" + gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl=TRUE) + gsub("\\b(\\w)", "\\U\\1", txt, perl=TRUE) + + txt2 <- "useRs may fly into JFK or laGuardia" + gsub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE) + sub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE) + + ## named capture + notables <- c(" Ben Franklin and Jefferson Davis", + "\tMillard Fillmore") + # name groups 'first' and 'last' + name.rex <- "(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)" + (parsed <- regexpr(name.rex, notables, perl = TRUE)) + gregexpr(name.rex, notables, perl = TRUE)[[2]] + parse.one <- function(res, result) { + m <- do.call(rbind, lapply(seq_along(res), function(i) { + if(result[i] == -1) return("") + st <- attr(result, "capture.start")[i, ] + substring(res[i], st, st + attr(result, "capture.length")[i, ] - 1) + })) + colnames(m) <- attr(result, "capture.names") + m + } + parse.one(notables, parsed) + + ## Decompose a URL into its components. + ## Example by LT (http://www.cs.uiowa.edu/~luke/R/regexp.html). + x <- "http://stat.umn.edu:80/xyz" + m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x) + m + regmatches(x, m) + ## Element 3 is the protocol, 4 is the host, 6 is the port, and 7 + ## is the path. We can use this to make a function for extracting the + ## parts of a URL: + URL_parts <- function(x) { + m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x) + parts <- do.call(rbind, + lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L))) + colnames(parts) <- c("protocol","host","port","path") + parts + } + URL_parts(x) + + ## gregexec() may match multiple times within a single string. + pattern <- "([[:alpha:]]+)([[:digit:]]+)" + s <- "Test: A1 BC23 DEF456" + m <- gregexec(pattern, s) + m + regmatches(s, m) + + ## Before gregexec() was implemented, one could emulate it by running + ## regexec() on the regmatches obtained via gregexpr(). E.g.: + lapply(regmatches(s, gregexpr(pattern, s)), + function(e) regmatches(e, regexec(pattern, e))) + +