diff --git "a/ta_aln<-gsub(fasta, pattern=\".fasta\", replacement=\"_aln.fasta\", fixed=TRUE)" "b/ta_aln<-gsub(fasta, pattern=\".fasta\", replacement=\"_aln.fasta\", fixed=TRUE)" deleted file mode 100644 index 38a0b9ea13410af82f3b0f07da56e3452c7c0621..0000000000000000000000000000000000000000 --- "a/ta_aln<-gsub(fasta, pattern=\".fasta\", replacement=\"_aln.fasta\", fixed=TRUE)" +++ /dev/null @@ -1,413 +0,0 @@ -grep package:base R Documentation - -_P_a_t_t_e_r_n _M_a_t_c_h_i_n_g _a_n_d _R_e_p_l_a_c_e_m_e_n_t - -_D_e_s_c_r_i_p_t_i_o_n: - - ‘grep’, ‘grepl’, ‘regexpr’, ‘gregexpr’, ‘regexec’ and ‘gregexec’ - search for matches to argument ‘pattern’ within each element of a - character vector: they differ in the format of and amount of - detail in the results. - - ‘sub’ and ‘gsub’ perform replacement of the first and all matches - respectively. - -_U_s_a_g_e: - - grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE, - fixed = FALSE, useBytes = FALSE, invert = FALSE) - - grepl(pattern, x, ignore.case = FALSE, perl = FALSE, - fixed = FALSE, useBytes = FALSE) - - sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, - fixed = FALSE, useBytes = FALSE) - - gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, - fixed = FALSE, useBytes = FALSE) - - regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, - fixed = FALSE, useBytes = FALSE) - - gregexpr(pattern, text, ignore.case = FALSE, perl = FALSE, - fixed = FALSE, useBytes = FALSE) - - regexec(pattern, text, ignore.case = FALSE, perl = FALSE, - fixed = FALSE, useBytes = FALSE) - - gregexec(pattern, text, ignore.case = FALSE, perl = FALSE, - fixed = FALSE, useBytes = FALSE) - -_A_r_g_u_m_e_n_t_s: - - pattern: character string containing a regular expression (or - character string for ‘fixed = TRUE’) to be matched in the - given character vector. Coerced by ‘as.character’ to a - character string if possible. If a character vector of - length 2 or more is supplied, the first element is used with - a warning. Missing values are allowed except for ‘regexpr’, - ‘gregexpr’ and ‘regexec’. - - x, text: a character vector where matches are sought, or an object - which can be coerced by ‘as.character’ to a character vector. - Long vectors are supported. - -ignore.case: if ‘FALSE’, the pattern matching is _case sensitive_ and - if ‘TRUE’, case is ignored during matching. - - perl: logical. Should Perl-compatible regexps be used? - - value: if ‘FALSE’, a vector containing the (‘integer’) indices of - the matches determined by ‘grep’ is returned, and if ‘TRUE’, - a vector containing the matching elements themselves is - returned. - - fixed: logical. If ‘TRUE’, ‘pattern’ is a string to be matched as - is. Overrides all conflicting arguments. - -useBytes: logical. If ‘TRUE’ the matching is done byte-by-byte rather - than character-by-character. See ‘Details’. - - invert: logical. If ‘TRUE’ return indices or values for elements - that do _not_ match. - -replacement: a replacement for matched pattern in ‘sub’ and ‘gsub’. - Coerced to character if possible. For ‘fixed = FALSE’ this - can include backreferences ‘"\1"’ to ‘"\9"’ to parenthesized - subexpressions of ‘pattern’. For ‘perl = TRUE’ only, it can - also contain ‘"\U"’ or ‘"\L"’ to convert the rest of the - replacement to upper or lower case and ‘"\E"’ to end case - conversion. If a character vector of length 2 or more is - supplied, the first element is used with a warning. If ‘NA’, - all elements in the result corresponding to matches will be - set to ‘NA’. - -_D_e_t_a_i_l_s: - - Arguments which should be character strings or character vectors - are coerced to character if possible. - - Each of these functions operates in one of three modes: - - 1. ‘fixed = TRUE’: use exact matching. - - 2. ‘perl = TRUE’: use Perl-style regular expressions. - - 3. ‘fixed = FALSE, perl = FALSE’: use POSIX 1003.2 extended - regular expressions (the default). - - See the help pages on regular expression for details of the - different types of regular expressions. - - The two ‘*sub’ functions differ only in that ‘sub’ replaces only - the first occurrence of a ‘pattern’ whereas ‘gsub’ replaces all - occurrences. If ‘replacement’ contains backreferences which are - not defined in ‘pattern’ the result is undefined (but most often - the backreference is taken to be ‘""’). - - For ‘regexpr’, ‘gregexpr’, ‘regexec’ and ‘gregexec’ it is an error - for ‘pattern’ to be ‘NA’, otherwise ‘NA’ is permitted and gives an - ‘NA’ match. - - Both ‘grep’ and ‘grepl’ take missing values in ‘x’ as not matching - a non-missing ‘pattern’. - - The main effect of ‘useBytes = TRUE’ is to avoid errors/warnings - about invalid inputs and spurious matches in multibyte locales, - but for ‘regexpr’ it changes the interpretation of the output. It - inhibits the conversion of inputs with marked encodings, and is - forced if any input is found which is marked as ‘"bytes"’ (see - ‘Encoding’). - - Caseless matching does not make much sense for bytes in a - multibyte locale, and you should expect it only to work for ASCII - characters if ‘useBytes = TRUE’. - - ‘regexpr’ and ‘gregexpr’ with ‘perl = TRUE’ allow Python-style - named captures, but not for _long vector_ inputs. - - Invalid inputs in the current locale are warned about up to 5 - times. - - Caseless matching with ‘perl = TRUE’ for non-ASCII characters - depends on the PCRE library being compiled with ‘Unicode property - support’, which PCRE2 is by default. - -_V_a_l_u_e: - - ‘grep(value = FALSE)’ returns a vector of the indices of the - elements of ‘x’ that yielded a match (or not, for ‘invert = - TRUE’). This will be an integer vector unless the input is a - _long vector_, when it will be a double vector. - - ‘grep(value = TRUE)’ returns a character vector containing the - selected elements of ‘x’ (after coercion, preserving names but no - other attributes). - - ‘grepl’ returns a logical vector (match or not for each element of - ‘x’). - - ‘sub’ and ‘gsub’ return a character vector of the same length and - with the same attributes as ‘x’ (after possible coercion to - character). Elements of character vectors ‘x’ which are not - substituted will be returned unchanged (including any declared - encoding). If ‘useBytes = FALSE’ a non-ASCII substituted result - will often be in UTF-8 with a marked encoding (e.g., if there is a - UTF-8 input, and in a multibyte locale unless ‘fixed = TRUE’). - Such strings can be re-encoded by ‘enc2native’. - - ‘regexpr’ returns an integer vector of the same length as ‘text’ - giving the starting position of the first match or -1 if there is - none, with attribute ‘"match.length"’, an integer vector giving - the length of the matched text (or -1 for no match). The match - positions and lengths are in characters unless ‘useBytes = TRUE’ - is used, when they are in bytes (as they are for ASCII-only - matching: in either case an attribute ‘useBytes’ with value ‘TRUE’ - is set on the result). If named capture is used there are further - attributes ‘"capture.start"’, ‘"capture.length"’ and - ‘"capture.names"’. - - ‘gregexpr’ returns a list of the same length as ‘text’ each - element of which is of the same form as the return value for - ‘regexpr’, except that the starting positions of every (disjoint) - match are given. - - ‘regexec’ returns a list of the same length as ‘text’ each element - of which is either -1 if there is no match, or a sequence of - integers with the starting positions of the match and all - substrings corresponding to parenthesized subexpressions of - ‘pattern’, with attribute ‘"match.length"’ a vector giving the - lengths of the matches (or -1 for no match). The interpretation - of positions and length and the attributes follows ‘regexpr’. - - ‘gregexec’ returns the same as ‘regexec’, except that to - accommodate multiple matches per element of ‘text’, the integer - sequences for each match are made into columns of a matrix, with - one matrix per element of ‘text’ with matches. - - Where matching failed because of resource limits (especially for - ‘perl = TRUE’) this is regarded as a non-match, usually with a - warning. - -_W_a_r_n_i_n_g: - - The POSIX 1003.2 mode of ‘gsub’ and ‘gregexpr’ does not work - correctly with repeated word-boundaries (e.g., ‘pattern = "\b"’). - Use ‘perl = TRUE’ for such matches (but that may not work as - expected with non-ASCII inputs, as the meaning of ‘word’ is - system-dependent). - -_P_e_r_f_o_r_m_a_n_c_e _c_o_n_s_i_d_e_r_a_t_i_o_n_s: - - If you are doing a lot of regular expression matching, including - on very long strings, you will want to consider the options used. - Generally ‘perl = TRUE’ will be faster than the default regular - expression engine, and ‘fixed = TRUE’ faster still (especially - when each pattern is matched only a few times). - - If you are working in a single-byte locale and have marked UTF-8 - strings that are representable in that locale, convert them first - as just one UTF-8 string will force all the matching to be done in - Unicode, which attracts a penalty of around 3x for the default - POSIX 1003.2 mode. - - If you can make use of ‘useBytes = TRUE’, the strings will not be - checked before matching, and the actual matching will be faster. - Often byte-based matching suffices in a UTF-8 locale since byte - patterns of one character never match part of another. Character - ranges may produce unexpected results. - - PCRE-based matching by default used to put additional effort into - ‘studying’ the compiled pattern when ‘x’/‘text’ has length 10 or - more. That study may use the PCRE JIT compiler on platforms where - it is available (see ‘pcre_config’). As from PCRE2 (PCRE version - >= 10.00 as reported by ‘extSoftVersion’), there is no study - phase, but the patterns are optimized automatically when possible, - and PCRE JIT is used when enabled. The details are controlled by - ‘options’ ‘PCRE_study’ and ‘PCRE_use_JIT’. (Some timing - comparisons can be seen by running file ‘tests/PCRE.R’ in the R - sources (and perhaps installed).) People working with PCRE and - very long strings can adjust the maximum size of the JIT stack by - setting environment variable ‘R_PCRE_JIT_STACK_MAXSIZE’ before JIT - is used to a value between ‘1’ and ‘1000’ in MB: the default is - ‘64’. When JIT is not used with PCRE version < 10.30 (that is with - PCRE1 and old versions of PCRE2), it might also be wise to set the - option ‘PCRE_limit_recursion’. - -_N_o_t_e: - - Aspects will be platform-dependent as well as local-dependent: for - example the implementation of character classes (except - ‘[:digit:]’ and ‘[:xdigit:]’). One can expect results to be - consistent for ASCII inputs and when working in UTF-8 mode (when - most platforms will use Unicode character tables, although those - are updated frequently and subject to some degree of - interpretation - is a circled capital letter alphabetic or a - symbol?). However, results in 8-bit encodings can differ - considerably between platforms, modes and from the UTF-8 versions. - -_S_o_u_r_c_e: - - The C code for POSIX-style regular expression matching has changed - over the years. As from R 2.10.0 (Oct 2009) the TRE library of - Ville Laurikari (<URL: https://github.com/laurikari/tre>) is used. - The POSIX standard does give some room for interpretation, - especially in the handling of invalid regular expressions and the - collation of character ranges, so the results will have changed - slightly over the years. - - For Perl-style matching PCRE2 or PCRE (<URL: - https://www.pcre.org>) is used: again the results may depend - (slightly) on the version of PCRE in use. - -_R_e_f_e_r_e_n_c_e_s: - - Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) _The New S - Language_. Wadsworth & Brooks/Cole (‘grep’) - -_S_e_e _A_l_s_o: - - regular expression (aka ‘regexp’) for the details of the pattern - specification. - - ‘regmatches’ for extracting matched substrings based on the - results of ‘regexpr’, ‘gregexpr’ and ‘regexec’. - - ‘glob2rx’ to turn wildcard matches into regular expressions. - - ‘agrep’ for approximate matching. - - ‘charmatch’, ‘pmatch’ for partial matching, ‘match’ for matching - to whole strings, ‘startsWith’ for matching of initial parts of - strings. - - ‘tolower’, ‘toupper’ and ‘chartr’ for character translations. - - ‘apropos’ uses regexps and has more examples. - - ‘grepRaw’ for matching raw vectors. - - Options ‘PCRE_limit_recursion’, ‘PCRE_study’ and ‘PCRE_use_JIT’. - - ‘extSoftVersion’ for the versions of regex and PCRE libraries in - use, ‘pcre_config’ for more details for PCRE. - -_E_x_a_m_p_l_e_s: - - grep("[a-z]", letters) - - txt <- c("arm","foot","lefroo", "bafoobar") - if(length(i <- grep("foo", txt))) - cat("'foo' appears at least once in\n\t", txt, "\n") - i # 2 and 4 - txt[i] - - ## Double all 'a' or 'b's; "\" must be escaped, i.e., 'doubled' - gsub("([ab])", "\\1_\\1_", "abc and ABC") - - txt <- c("The", "licenses", "for", "most", "software", "are", - "designed", "to", "take", "away", "your", "freedom", - "to", "share", "and", "change", "it.", - "", "By", "contrast,", "the", "GNU", "General", "Public", "License", - "is", "intended", "to", "guarantee", "your", "freedom", "to", - "share", "and", "change", "free", "software", "--", - "to", "make", "sure", "the", "software", "is", - "free", "for", "all", "its", "users") - ( i <- grep("[gu]", txt) ) # indices - stopifnot( txt[i] == grep("[gu]", txt, value = TRUE) ) - - ## Note that for some implementations character ranges are - ## locale-dependent (but not currently). Then [b-e] in locales such as - ## en_US may include B as the collation order is aAbBcCdDe ... - (ot <- sub("[b-e]",".", txt)) - txt[ot != gsub("[b-e]",".", txt)]#- gsub does "global" substitution - ## In caseless matching, ranges include both cases: - a <- grep("[b-e]", txt, value = TRUE) - b <- grep("[b-e]", txt, ignore.case = TRUE, value = TRUE) - setdiff(b, a) - - txt[gsub("g","#", txt) != - gsub("g","#", txt, ignore.case = TRUE)] # the "G" words - - regexpr("en", txt) - - gregexpr("e", txt) - - ## Using grepl() for filtering - ## Find functions with argument names matching "warn": - findArgs <- function(env, pattern) { - nms <- ls(envir = as.environment(env)) - nms <- nms[is.na(match(nms, c("F","T")))] # <-- work around "checking hack" - aa <- sapply(nms, function(.) { o <- get(.) - if(is.function(o)) names(formals(o)) }) - iw <- sapply(aa, function(a) any(grepl(pattern, a, ignore.case=TRUE))) - aa[iw] - } - findArgs("package:base", "warn") - - ## trim trailing white space - str <- "Now is the time " - sub(" +$", "", str) ## spaces only - ## what is considered 'white space' depends on the locale. - sub("[[:space:]]+$", "", str) ## white space, POSIX-style - ## what PCRE considered white space changed in version 8.34: see ?regex - sub("\\s+$", "", str, perl = TRUE) ## PCRE-style white space - - ## capitalizing - txt <- "a test of capitalizing" - gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl=TRUE) - gsub("\\b(\\w)", "\\U\\1", txt, perl=TRUE) - - txt2 <- "useRs may fly into JFK or laGuardia" - gsub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE) - sub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE) - - ## named capture - notables <- c(" Ben Franklin and Jefferson Davis", - "\tMillard Fillmore") - # name groups 'first' and 'last' - name.rex <- "(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)" - (parsed <- regexpr(name.rex, notables, perl = TRUE)) - gregexpr(name.rex, notables, perl = TRUE)[[2]] - parse.one <- function(res, result) { - m <- do.call(rbind, lapply(seq_along(res), function(i) { - if(result[i] == -1) return("") - st <- attr(result, "capture.start")[i, ] - substring(res[i], st, st + attr(result, "capture.length")[i, ] - 1) - })) - colnames(m) <- attr(result, "capture.names") - m - } - parse.one(notables, parsed) - - ## Decompose a URL into its components. - ## Example by LT (http://www.cs.uiowa.edu/~luke/R/regexp.html). - x <- "http://stat.umn.edu:80/xyz" - m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x) - m - regmatches(x, m) - ## Element 3 is the protocol, 4 is the host, 6 is the port, and 7 - ## is the path. We can use this to make a function for extracting the - ## parts of a URL: - URL_parts <- function(x) { - m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x) - parts <- do.call(rbind, - lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L))) - colnames(parts) <- c("protocol","host","port","path") - parts - } - URL_parts(x) - - ## gregexec() may match multiple times within a single string. - pattern <- "([[:alpha:]]+)([[:digit:]]+)" - s <- "Test: A1 BC23 DEF456" - m <- gregexec(pattern, s) - m - regmatches(s, m) - - ## Before gregexec() was implemented, one could emulate it by running - ## regexec() on the regmatches obtained via gregexpr(). E.g.: - lapply(regmatches(s, gregexpr(pattern, s)), - function(e) regmatches(e, regexec(pattern, e))) - -