# File src/library/base/R/grep.R # Part of the R package, https://www.R-project.org # # Copyright (C) 1995-2015 The R Core Team # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A copy of the GNU General Public License is available at # https://www.R-project.org/Licenses/ ## Q: Why are we using as.character(.) all over the place instead of doing that in C ? ## A: These must work for objects which have their own as.character(.) methods *and* ## as.character() is fast [Primitive] strsplit <- function(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE) .Internal(strsplit(x, as.character(split), fixed, perl, useBytes)) grep <- function(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE, fixed = FALSE, useBytes = FALSE, invert = FALSE) { ## when value = TRUE we return names if(!is.character(x)) x <- structure(as.character(x), names=names(x)) .Internal(grep(as.character(pattern), x, ignore.case, value, perl, fixed, useBytes, invert)) } grepl <- function(pattern, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) { if(!is.character(x)) x <- as.character(x) .Internal(grepl(as.character(pattern), x, ignore.case, FALSE, perl, fixed, useBytes, FALSE)) } sub <- function(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) { if (!is.character(x)) x <- as.character(x) .Internal(sub(as.character(pattern), as.character(replacement), x, ignore.case, perl, fixed, useBytes)) } gsub <- function(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) { if (!is.character(x)) x <- as.character(x) .Internal(gsub(as.character(pattern), as.character(replacement), x, ignore.case, perl, fixed, useBytes)) } regexpr <- function(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) { if (!is.character(text)) text <- as.character(text) .Internal(regexpr(as.character(pattern), text, ignore.case, perl, fixed, useBytes)) } gregexpr <- function(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) { if (!is.character(text)) text <- as.character(text) .Internal(gregexpr(as.character(pattern), text, ignore.case, perl, fixed, useBytes)) } grepRaw <- function(pattern, x, offset = 1L, ignore.case = FALSE, value = FALSE, fixed = FALSE, all = FALSE, invert = FALSE) { if (!is.raw(pattern)) pattern <- charToRaw(as.character(pattern)) if (!is.raw(x)) x <- charToRaw(as.character(x)) .Internal(grepRaw(pattern, x, offset, ignore.case, fixed, value, all, invert)) } regexec <- function(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) { if (!is.character(text)) text <- as.character(text) if(!perl || fixed) return(.Internal(regexec(as.character(pattern), text, ignore.case, fixed, useBytes))) ## For perl = TRUE, re-use regexpr(perl = TRUE) which always ## captures subexpressions. match_data_from_pos_and_len <- function(pos, len) { attr(pos, "match.length") <- len pos } m <- regexpr(pattern, text, ignore.case = ignore.case, useBytes = useBytes, perl = TRUE) y <- vector("list", length(text)) y[is.na(m)] <- list(match_data_from_pos_and_len(NA_integer_, NA_integer_)) ind <- !is.na(m) & (m == -1L) if(any(ind)) { y[ind] <- list(match_data_from_pos_and_len(-1L, -1L)) } ind <- !is.na(m) & !ind if(any(ind)) { pos <- cbind(m[ind], attr(m, "capture.start")[ind, , drop = FALSE]) len <- cbind(attr(m, "match.length")[ind], attr(m, "capture.length")[ind, , drop = FALSE]) y[ind] <- Map(match_data_from_pos_and_len, split(pos, row(pos)), split(len, row(len))) } if(identical(attr(m, "useBytes"), TRUE)) y <- lapply(y, `attr<-`, "useBytes", TRUE) y } agrep <- function(pattern, x, max.distance = 0.1, costs = NULL, ignore.case = FALSE, value = FALSE, fixed = TRUE, useBytes = FALSE) { pattern <- as.character(pattern) if(!is.character(x)) x <- as.character(x) ## TRE needs integer costs: coerce here for simplicity. costs <- as.integer(.amatch_costs(costs)) bounds <- .amatch_bounds(max.distance) .Internal(agrep(pattern, x, ignore.case, value, costs, bounds, useBytes, fixed)) } agrepl <- function(pattern, x, max.distance = 0.1, costs = NULL, ignore.case = FALSE, fixed = TRUE, useBytes = FALSE) { pattern <- as.character(pattern) if(!is.character(x)) x <- as.character(x) ## TRE needs integer costs: coerce here for simplicity. costs <- as.integer(.amatch_costs(costs)) bounds <- .amatch_bounds(max.distance) .Internal(agrepl(pattern, x, ignore.case, FALSE, costs, bounds, useBytes, fixed)) } .amatch_bounds <- function(x = 0.1) { ## Expand max match distance argument for agrep() et al into bounds ## for the TRE regaparams struct. ## Note that TRE allows for possibly different (integer) costs for ## insertions, deletions and substitions, and allows for specifying ## separate bounds for these numbers as well as the total number of ## "errors" (transformations) and the total cost. ## ## When using unit costs (and older versions of agrep() did not ## allow otherwise), the total number of errors is the same as the ## total cost, and bounds on the total number of errors imply the ## same bounds for the individual transformation counts. This no ## longer holds when using possibly different costs. ## ## See ? agrep for details on handling the match distance argument. ## ## Older versions of agrep() expanded fractions (of the pattern ## length) in R code: but as the C code determines whether matching ## used bytes or characters, only the C code can determine the ## pattern length and hence expand fractions. ## ## Unspecified bounds are taken as NA_real_, and set to INT_MAX by ## the C code. if(!is.list(x)) { ## Sanity checks. if(!is.numeric(x) || (x < 0)) stop("match distance components must be non-negative") bounds <- c(as.double(x), rep.int(NA_real_, 4L)) } else { table <- c("cost", "insertions", "deletions", "substitutions", "all") ## Partial matching. pos <- pmatch(names(x), table) if(anyNA(pos)) { warning("unknown match distance components ignored") x <- x[!is.na(pos)] } names(x) <- table[pos] ## Sanity checks. x <- unlist(x) if(!all(is.numeric(x)) || any(x < 0)) stop("match distance components must be non-negative") ## Defaults. if(!is.na(x["cost"])) { bounds <- rep.int(NA_real_, 5L) } else { ## If 'cost' is missing: if 'all' is missing it is set to ## 0.1, and the other transformation number bounds default ## to 'all'. if(is.na(x["all"])) x["all"] <- 0.1 bounds <- c(NA_real_, rep.int(x["all"], 4L)) } names(bounds) <- table bounds[names(x)] <- x } bounds } .amatch_costs <- function(x = NULL) { costs <- c(insertions = 1, deletions = 1, substitutions = 1) if(!is.null(x)) { x <- as.list(x) ## Partial matching. pos <- pmatch(names(x), names(costs)) if(anyNA(pos)) { warning("unknown cost components ignored") x <- x[!is.na(pos)] } ## Sanity checks. x <- unlist(x) if(!all(is.numeric(x)) || any(x < 0)) stop("cost components must be non-negative") costs[pos] <- x } costs } regmatches <- function(x, m, invert = FALSE) { if(length(x) != length(m)) stop(gettextf("%s and %s must have the same length", sQuote("x"), sQuote("m")), domain = NA) ili <- is.list(m) ## Handle useBytes/encoding issues. ## Match positions from regexpr(), gregexpr() and regexec() are in ## characters unless 'useBytes = TRUE' was given, now recorded via ## the 'index.type' attribute (in addition to the 'useBytes' one ## being TRUE when 'useBytes = TRUE' was given *or* all character ## string involved were ASCII). ## To convince substring() and nchar() used below accordingly that ## match data positions are in bytes, we set the input encoding to ## "bytes" for the former and call the latter with 'type = "bytes"'. itype <- "chars" useBytes <- if(ili) any(unlist(lapply(m, attr, "index.type")) == "bytes") else any(attr(m, "index.type") == "bytes") if(useBytes) { itype <- Encoding(x) <- "bytes" } ## For NA matches (from matching a non-NA pattern on an NA string), ## direct matches give nothing and inverse matches give NA (as ## nothing was matched). if(!ili && isFALSE(invert)) { so <- m[ind <- (!is.na(m) & (m > -1L))] eo <- so + attr(m, "match.length")[ind] - 1L return(substring(x[ind], so, eo)) } y <- if(is.na(invert)) { Map(function(u, so, ml) { if((n <- length(so)) == 1L) { if(is.na(so) ) return(NA_character_) # Or u ... else if(so == -1L) return(u) } eo <- so + ml - 1L if(n > 1L) { ## regexec() could give overlapping matches. ## Matches are non-overlapping iff ## eo[i] < so[i + 1], i = 1, ..., n - 1. if(any(eo[-n] >= so[-1L])) stop(gettextf("need non-overlapping matches for %s", sQuote("invert = NA")), domain = NA) } beg <- c(1L, c(rbind(so, eo + 1L))) end <- c(c(rbind(so - 1L, eo)), nchar(u, itype)) substring(u, beg, end) }, x, m, if(ili) lapply(m, attr, "match.length") else attr(m, "match.length"), USE.NAMES = FALSE) } else if(invert) { Map(function(u, so, ml) { if((n <- length(so)) == 1L) { if(is.na(so) ) return(NA_character_) # Or u ... else if(so == -1L) return(u) } beg <- if(n > 1L) { ## See above. eo <- so + ml - 1L if(any(eo[-n] >= so[-1L])) stop(gettextf("need non-overlapping matches for %s", sQuote("invert = TRUE")), domain = NA) c(1L, eo + 1L) } else { c(1L, so + ml) } end <- c(so - 1L, nchar(u, itype)) substring(u, beg, end) }, x, m, if(ili) lapply(m, attr, "match.length") else attr(m, "match.length"), USE.NAMES = FALSE) } else { Map(function(u, so, ml) { if(length(so) == 1L) { if(is.na(so) || (so == -1L)) return(character()) } substring(u, so, so + ml - 1L) }, x, m, lapply(m, attr, "match.length"), USE.NAMES = FALSE) } names(y) <- names(x) y } ## Suppose matching partitions a string as ## n0 m1 n1 ... mk nk ## where the m and n substrings are the matched and non-matched parts, ## respectively, and n0 and/or nk can be empty. ## (regexec() can give overlapping matches, in which case extracting ## inverted matches or replacing cannot work.) ## For list match data, k can be any non-negative integer. ## Extraction and replacement straightforwardly work on the m or n ## sequences, depending on whether invert is FALSE or TRUE. ## For vector match data from regexpr(), k can be 0 or 1. ## If k = 0 (no match): ## invert ## FALSE TRUE ## extract drop n0 ## replace n0 r0 ## If k = 1: ## invert ## FALSE TRUE ## extract m1 c(n0, n1) ## replace n0 r1 n1 r0 m1 r1 `regmatches<-` <- function(x, m, invert = FALSE, value) { if(!length(x)) return(x) ili <- is.list(m) if(!ili && invert && any(m == -1L)) { ## regmatches() drops empty matches for vector match data if ## invert is FALSE (see above): we need to work around this when ## replacing non-matches (PR #15723). y <- rep_len(list(character()), length(x)) y[m > -1L] <- as.list(regmatches(x, m, FALSE)) } else { y <- regmatches(x, m, !invert) } ## ## It might be simpler to try reducing the vector case to the list ## case, transforming m and value as needed, ## if(!ili && !invert) { ## For non-list m and invert = FALSE, we need a character vector ## of replacement values with length the number of matched ## elements. value <- as.character(value) if(anyNA(value)) stop("missing replacement values are not allowed") ## Entries for matched elements have length 2. pos <- which(lengths(y) == 2L) np <- length(pos) nv <- length(value) if(np != nv) { if(!nv) stop("must have replacement values for matches") value <- rep_len(value, np) } y <- y[pos] x[pos] <- paste0(sapply(y, `[`, 1L), value, sapply(y, `[`, 2L)) return(x) } ## We need a list of character vectors without missings, which has ## the same length as x. value <- lapply(value, as.character) if(anyNA(value)) # {recursively!} stop("missing replacement values are not allowed") if(!length(value)) stop("value does not provide any replacement values") value <- rep_len(value, length(x)) y <- if(invert) { ## Replace non-matches. ## An element of x with k matches has a corresponding y element ## of length k, and needs k + 1 replacement values. Map(function(u, v) { nu <- length(u) nv <- length(v) if(nv != (nu + 1L)) { if(!nv) stop("must have replacements for non-matches") v <- rep_len(v, nu + 1L) } paste0(v, c(u, ""), collapse = "") }, y, value, USE.NAMES = FALSE) } else { ## Replace matches. ## An element of x with k matches has a corresponding y element ## of length k + 1, and needs k replacement values. Map(function(u, v) { nu <- length(u) nv <- length(v) if(nv != (nu - 1L)) { if(!nv) stop("must have replacements for matches") v <- rep_len(v, nu - 1L) } paste0(u, c(v, ""), collapse = "") }, y, value, USE.NAMES = FALSE) } y <- unlist(y) names(y) <- names(x) y } pcre_config <- function() .Internal(pcre_config())