/* * R : A Computer Language for Statistical Data Analysis * Copyright (C) 2002--2023 The R Core Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Pulic License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, a copy is available at * https://www.R-project.org/Licenses/ */ /* This at times needed to be separate from grep.c, as TRE has a conflicting regcomp and the two headers cannot both be included in one file */ #ifdef HAVE_CONFIG_H # include #endif #include #include /* This is remapped */ #undef pmatch /* interval at which to check interrupts */ /* if re-enabling, consider a power of two */ /* #define NINTERRUPT 1000000 */ #include /* for R_Calloc/R_Free */ #include #include static void amatch_regaparams(regaparams_t *params, int patlen, double *bounds, int *costs) { int cost, max_cost, warn = 0; double bound; cost = params->cost_ins = max_cost = costs[0]; cost = params->cost_del = costs[1]; if(cost > max_cost) max_cost = cost; cost = params->cost_subst = costs[2]; if(cost > max_cost) max_cost = cost; bound = bounds[0]; if(ISNA(bound)) { params->max_cost = INT_MAX; } else { if(bound < 1) bound *= (patlen * max_cost); params->max_cost = IntegerFromReal(ceil(bound), &warn); CoercionWarning(warn); } bound = bounds[1]; if(ISNA(bound)) { params->max_del = INT_MAX; } else { if(bound < 1) bound *= patlen; params->max_del = IntegerFromReal(ceil(bound), &warn); CoercionWarning(warn); } bound = bounds[2]; if(ISNA(bound)) { params->max_ins = INT_MAX; } else { if(bound < 1) bound *= patlen; params->max_ins = IntegerFromReal(ceil(bound), &warn); CoercionWarning(warn); } bound = bounds[3]; if(ISNA(bound)) { params->max_subst = INT_MAX; } else { if(bound < 1) bound *= patlen; params->max_subst = IntegerFromReal(ceil(bound), &warn); CoercionWarning(warn); } bound = bounds[4]; if(ISNA(bound)) { params->max_err = INT_MAX; } else { if(bound < 1) bound *= patlen; params->max_err = IntegerFromReal(ceil(bound), &warn); CoercionWarning(warn); } } attribute_hidden SEXP do_agrep(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP pat, vec, ind, ans; SEXP opt_costs, opt_bounds; int opt_icase, opt_value, opt_fixed, useBytes; R_xlen_t i, j, n; int nmatches, patlen; Rboolean useWC = FALSE; const void *vmax = NULL; regex_t reg; regaparams_t params; regamatch_t match; int rc, cflags = REG_EXTENDED | REG_NOSUB; checkArity(op, args); pat = CAR(args); args = CDR(args); vec = CAR(args); args = CDR(args); opt_icase = asLogical(CAR(args)); args = CDR(args); opt_value = asLogical(CAR(args)); args = CDR(args); opt_costs = CAR(args); args = CDR(args); opt_bounds = CAR(args); args = CDR(args); useBytes = asLogical(CAR(args)); args = CDR(args); opt_fixed = asLogical(CAR(args)); if(opt_icase == NA_INTEGER) opt_icase = 0; if(opt_value == NA_INTEGER) opt_value = 0; if(useBytes == NA_INTEGER) useBytes = 0; if(opt_fixed == NA_INTEGER) opt_fixed = 1; if(opt_fixed) cflags |= REG_LITERAL; if(!isString(pat) || LENGTH(pat) < 1) error(_("invalid '%s' argument"), "pattern"); if(LENGTH(pat) > 1) warning(_("argument '%s' has length > 1 and only the first element will be used"), "pattern"); if(!isString(vec)) error(_("invalid '%s' argument"), "x"); if(opt_icase) cflags |= REG_ICASE; n = XLENGTH(vec); if(!useBytes) { Rboolean haveBytes = IS_BYTES(STRING_ELT(pat, 0)); if(!haveBytes) for (i = 0; i < n; i++) if(IS_BYTES(STRING_ELT(vec, i))) { haveBytes = TRUE; break; } if(haveBytes) useBytes = TRUE; } if(!useBytes) { useWC = !IS_ASCII(STRING_ELT(pat, 0)); if(!useWC) { for (i = 0 ; i < n ; i++) { if(STRING_ELT(vec, i) == NA_STRING) continue; if(!IS_ASCII(STRING_ELT(vec, i))) { useWC = TRUE; break; } } } } if(STRING_ELT(pat, 0) == NA_STRING) { if(opt_value) { PROTECT(ans = allocVector(STRSXP, n)); for(i = 0; i < n; i++) SET_STRING_ELT(ans, i, NA_STRING); SEXP nms = getAttrib(vec, R_NamesSymbol); if(!isNull(nms)) setAttrib(ans, R_NamesSymbol, nms); } else { PROTECT(ans = allocVector(INTSXP, n)); for(i = 0; i < n; i++) INTEGER(ans)[i] = NA_INTEGER; } UNPROTECT(1); return ans; } SEXP s_nchar = install("nchar"); if(useBytes) PROTECT(call = lang3(s_nchar, pat, ScalarString(mkChar("bytes")))); else PROTECT(call = lang3(s_nchar, pat, ScalarString(mkChar("chars")))); patlen = asInteger(eval(call, env)); UNPROTECT(1); if(!patlen) error(_("'pattern' must be a non-empty character string")); /* wtransChar and translateChar can R_alloc */ vmax = vmaxget(); if(useBytes) rc = tre_regcompb(®, CHAR(STRING_ELT(pat, 0)), cflags); else if(useWC) rc = tre_regwcomp(®, wtransChar(STRING_ELT(pat, 0)), cflags); else { const char *spat = translateChar(STRING_ELT(pat, 0)); if(mbcslocale && !mbcsValid(spat)) error(_("regular expression is invalid in this locale")); rc = tre_regcomp(®, spat, cflags); } if(rc) { char errbuf[1001]; tre_regerror(rc, ®, errbuf, 1001); error(_("regcomp error: '%s'"), errbuf); } tre_regaparams_default(¶ms); amatch_regaparams(¶ms, patlen, REAL(opt_bounds), INTEGER(opt_costs)); /* Matching. */ n = LENGTH(vec); PROTECT(ind = allocVector(LGLSXP, n)); nmatches = 0; for (i = 0 ; i < n ; i++) { // if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt(); if(STRING_ELT(vec, i) == NA_STRING) { LOGICAL(ind)[i] = 0; continue; } /* Perform match. */ /* undocumented, must be zeroed */ memset(&match, 0, sizeof(match)); if(useBytes) rc = tre_regaexecb(®, CHAR(STRING_ELT(vec, i)), &match, params, 0); else if(useWC) { rc = tre_regawexec(®, wtransChar(STRING_ELT(vec, i)), &match, params, 0); vmaxset(vmax); } else { const char *s = translateChar(STRING_ELT(vec, i)); if(mbcslocale && !mbcsValid(s)) error(_("input string %lld is invalid in this locale"), (long long)i+1); rc = tre_regaexec(®, s, &match, params, 0); vmaxset(vmax); } if(rc == REG_OK) { LOGICAL(ind)[i] = 1; nmatches++; } else LOGICAL(ind)[i] = 0; } tre_regfree(®); if (PRIMVAL(op)) {/* agrepl case */ UNPROTECT(1); return ind; } if(opt_value) { PROTECT(ans = allocVector(STRSXP, nmatches)); SEXP nmold = getAttrib(vec, R_NamesSymbol), nm; for (j = i = 0 ; i < n ; i++) { if(LOGICAL(ind)[i]) SET_STRING_ELT(ans, j++, STRING_ELT(vec, i)); } /* copy across names and subset */ if(!isNull(nmold)) { nm = allocVector(STRSXP, nmatches); for (i = 0, j = 0; i < n ; i++) if(LOGICAL(ind)[i]) SET_STRING_ELT(nm, j++, STRING_ELT(nmold, i)); setAttrib(ans, R_NamesSymbol, nm); } } #ifdef LONG_VECTOR_SUPPORT else if (n > INT_MAX) { PROTECT(ans = allocVector(REALSXP, nmatches)); for (j = i = 0 ; i < n ; i++) if(LOGICAL(ind)[i] == 1) REAL(ans)[j++] = (double)(i + 1); } #endif else { PROTECT(ans = allocVector(INTSXP, nmatches)); for (j = i = 0 ; i < n ; i++) if(LOGICAL(ind)[i] == 1) INTEGER(ans)[j++] = (int)(i + 1); } UNPROTECT(2); return ans; } #define ANS(I, J) REAL(ans)[I + J * nx] #define COUNTS(I, J, K) INTEGER(counts)[I + J * nx + K * nxy] #define MAT(X, I, J) X[I + (J) * nr] static SEXP adist_full(SEXP x, SEXP y, double *costs, Rboolean opt_counts) { SEXP ans, counts, trafos = R_NilValue /* -Wall */, dimnames, names; double cost_ins, cost_del, cost_sub; double *dists, d, d_ins, d_del, d_sub; char *paths = NULL, p, *buf = NULL; int i, j, k, l, m, nx, ny, nxy, *xi, *yj, nxi, nyj, nr, nc, nz; int nins, ndel, nsub, buflen = 100, need; counts = R_NilValue; /* -Wall */ nx = LENGTH(x); ny = LENGTH(y); nxy = nx * ny; cost_ins = costs[0]; cost_del = costs[1]; cost_sub = costs[2]; PROTECT(ans = allocMatrix(REALSXP, nx, ny)); if(opt_counts) { PROTECT(counts = alloc3DArray(INTSXP, nx, ny, 3)); PROTECT(trafos = allocMatrix(STRSXP, nx, ny)); buf = R_Calloc(buflen, char); } for(i = 0; i < nx; i++) { nxi = LENGTH(VECTOR_ELT(x, i)); xi = INTEGER(VECTOR_ELT(x, i)); if(nxi && (xi[0] == NA_INTEGER)) { for(j = 0; j < ny; j++) { ANS(i, j) = NA_REAL; } if(opt_counts) { for(m = 0; m < 3; m++) { COUNTS(i, j, m) = NA_INTEGER; } } } else { for(j = 0; j < ny; j++) { nyj = LENGTH(VECTOR_ELT(y, j)); yj = INTEGER(VECTOR_ELT(y, j)); if(nyj && (yj[0] == NA_INTEGER)) { ANS(i, j) = NA_REAL; if(opt_counts) { for(m = 0; m < 3; m++) { COUNTS(i, j, m) = NA_INTEGER; } } } else { /* Determine operation-weighted edit distance via * straightforward dynamic programming. */ nr = nxi + 1; nc = nyj + 1; dists = R_Calloc(nr * nc, double); MAT(dists, 0, 0) = 0; for(k = 1; k < nr; k++) MAT(dists, k, 0) = k * cost_del; for(l = 1; l < nc; l++) MAT(dists, 0, l) = l * cost_ins; if(opt_counts) { paths = R_Calloc(nr * nc, char); for(k = 1; k < nr; k++) MAT(paths, k, 0) = 'D'; for(l = 1; l < nc; l++) MAT(paths, 0, l) = 'I'; } for(k = 1; k < nr; k++) { for(l = 1; l < nc; l++) { if(xi[k - 1] == yj[l - 1]) { MAT(dists, k, l) = MAT(dists, k - 1, l - 1); if(opt_counts) MAT(paths, k, l) = 'M'; } else { d_ins = MAT(dists, k, l - 1) + cost_ins; d_del = MAT(dists, k - 1, l) + cost_del; d_sub = MAT(dists, k - 1, l - 1) + cost_sub; if(opt_counts) { if(d_ins <= d_del) { d = d_ins; p = 'I'; } else { d = d_del; p = 'D'; } if(d_sub < d) { d = d_sub; p = 'S'; } MAT(paths, k, l) = p; } else { d = fmin(fmin(d_ins, d_del), d_sub); } MAT(dists, k, l) = d; } } } ANS(i, j) = MAT(dists, nxi, nyj); if(opt_counts) { if(!R_finite(ANS(i, j))) { for(m = 0; m < 3; m++) { COUNTS(i, j, m) = NA_INTEGER; } SET_STRING_ELT(trafos, i + nx * j, NA_STRING); } else { nins = ndel = nsub = 0; k = nxi; l = nyj; m = k + l; nz = m; need = 2 * m + 1; if(buflen < need) { buf = R_Realloc(buf, need , char); buflen = need; } /* Need to read backwards and fill forwards. */ while((k > 0) || (l > 0)) { p = MAT(paths, k, l); if(p == 'I') { nins++; l--; } else if(p == 'D') { ndel++; k--; } else { if(p == 'S') nsub++; k--; l--; } buf[m] = p; m++; } /* Now reverse the transcript. */ for(k = 0, l = --m; l >= nz; k++, l--) buf[k] = buf[l]; buf[k] = '\0'; COUNTS(i, j, 0) = nins; COUNTS(i, j, 1) = ndel; COUNTS(i, j, 2) = nsub; SET_STRING_ELT(trafos, i + nx * j, mkChar(buf)); } R_Free(paths); } R_Free(dists); } } } } PROTECT(x = getAttrib(x, R_NamesSymbol)); PROTECT(y = getAttrib(y, R_NamesSymbol)); if(!isNull(x) || !isNull(y)) { PROTECT(dimnames = allocVector(VECSXP, 2)); SET_VECTOR_ELT(dimnames, 0, x); SET_VECTOR_ELT(dimnames, 1, y); setAttrib(ans, R_DimNamesSymbol, dimnames); UNPROTECT(1); /* dimnames */ } if(opt_counts) { R_Free(buf); PROTECT(dimnames = allocVector(VECSXP, 3)); PROTECT(names = allocVector(STRSXP, 3)); SET_STRING_ELT(names, 0, mkChar("ins")); SET_STRING_ELT(names, 1, mkChar("del")); SET_STRING_ELT(names, 2, mkChar("sub")); SET_VECTOR_ELT(dimnames, 0, x); SET_VECTOR_ELT(dimnames, 1, y); SET_VECTOR_ELT(dimnames, 2, names); setAttrib(counts, R_DimNamesSymbol, dimnames); setAttrib(ans, install("counts"), counts); UNPROTECT(2); /* names, dimnames */ if(!isNull(x) || !isNull(y)) { PROTECT(dimnames = allocVector(VECSXP, 2)); SET_VECTOR_ELT(dimnames, 0, x); SET_VECTOR_ELT(dimnames, 1, y); setAttrib(trafos, R_DimNamesSymbol, dimnames); UNPROTECT(1); /* dimnames */ } setAttrib(ans, install("trafos"), trafos); UNPROTECT(2); /* trafos, counts */ } UNPROTECT(3); /* y, x, ans */ return ans; } #define OFFSETS(I, J, K) INTEGER(offsets)[I + J * nx + K * nxy] attribute_hidden SEXP do_adist(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP x, y; SEXP ans, counts, offsets, dimnames, names, elt; SEXP opt_costs; int opt_fixed, opt_partial, opt_counts, opt_icase, useBytes; int i = 0, j = 0, m, nx, ny, nxy; const char *s, *t; const void *vmax = NULL; Rboolean haveBytes, useWC = FALSE; regex_t reg; regaparams_t params; regamatch_t match; size_t nmatch = 0 /* -Wall */; regmatch_t *pmatch = NULL; /* -Wall */ int rc, cflags = REG_EXTENDED; checkArity(op, args); x = CAR(args); args = CDR(args); y = CAR(args); args = CDR(args); opt_costs = CAR(args); args = CDR(args); opt_counts = asLogical(CAR(args)); args = CDR(args); opt_fixed = asInteger(CAR(args)); args = CDR(args); opt_partial = asInteger(CAR(args)); args = CDR(args); opt_icase = asLogical(CAR(args)); args = CDR(args); useBytes = asLogical(CAR(args)); if(opt_counts == NA_INTEGER) opt_counts = 0; if(opt_fixed == NA_INTEGER) opt_fixed = 1; if(opt_partial == NA_INTEGER) opt_partial = 0; if(opt_icase == NA_INTEGER) opt_icase = 0; if(useBytes == NA_INTEGER) useBytes = 0; if(opt_fixed) cflags |= REG_LITERAL; if(opt_icase) cflags |= REG_ICASE; if(!opt_fixed && !opt_partial) { warning(_("argument '%s' will be ignored"), "partial = FALSE"); } if(!opt_partial) return(adist_full(x, y, REAL(opt_costs), opt_counts)); counts = R_NilValue; /* -Wall */ offsets = R_NilValue; /* -Wall */ if(!opt_counts) cflags |= REG_NOSUB; nx = length(x); ny = length(y); nxy = nx * ny; if(!useBytes) { haveBytes = FALSE; for(i = 0; i < nx; i++) { if(IS_BYTES(STRING_ELT(x, i))) { haveBytes = TRUE; break; } } if(!haveBytes) { for(j = 0; j < ny; j++) { if(IS_BYTES(STRING_ELT(y, j))) { haveBytes = TRUE; break; } } } if(haveBytes) useBytes = TRUE; } if(!useBytes) { for(i = 0; i < nx; i++) { if(STRING_ELT(x, i) == NA_STRING) continue; if(!IS_ASCII(STRING_ELT(x, i))) { useWC = TRUE; break; } } if(!useWC) { for(j = 0; j < ny; j++) { if(STRING_ELT(y, j) == NA_STRING) continue; if(!IS_ASCII(STRING_ELT(y, j))) { useWC = TRUE; break; } } } } tre_regaparams_default(¶ms); params.max_cost = INT_MAX; params.cost_ins = INTEGER(opt_costs)[0];; params.cost_del = INTEGER(opt_costs)[1]; params.cost_subst = INTEGER(opt_costs)[2]; PROTECT(ans = allocMatrix(REALSXP, nx, ny)); if(opt_counts) { PROTECT(counts = alloc3DArray(INTSXP, nx, ny, 3)); PROTECT(offsets = alloc3DArray(INTSXP, nx, ny, 2)); } /* wtransChar and translateChar can R_alloc */ vmax = vmaxget(); for(i = 0; i < nx; i++) { elt = STRING_ELT(x, i); if(elt == NA_STRING) { for(j = 0; j < ny; j++) { ANS(i, j) = NA_REAL; if(opt_counts) { for(m = 0; m < 3; m++) { COUNTS(i, j, m) = NA_INTEGER; } OFFSETS(i, j, 0) = -1; OFFSETS(i, j, 1) = -1; } } } else { if(useBytes) rc = tre_regcompb(®, CHAR(elt), cflags); else if(useWC) { rc = tre_regwcomp(®, wtransChar(elt), cflags); vmaxset(vmax); } else { s = translateChar(elt); if(mbcslocale && !mbcsValid(s)) { error(_("input string x[%d] is invalid in this locale"), i + 1); } rc = tre_regcomp(®, s, cflags); vmaxset(vmax); } if(rc) { char errbuf[1001]; tre_regerror(rc, ®, errbuf, 1001); error(_("regcomp error: '%s'"), errbuf); } if(opt_counts) { nmatch = reg.re_nsub + 1; pmatch = (regmatch_t *) malloc(nmatch * sizeof(regmatch_t)); if (pmatch == NULL) error("allocation failure in adist"); } for(j = 0; j < ny; j++) { elt = STRING_ELT(y, j); if(elt == NA_STRING) { ANS(i, j) = NA_REAL; if(opt_counts) { for(m = 0; m < 3; m++) { COUNTS(i, j, m) = NA_INTEGER; } OFFSETS(i, j, 0) = -1; OFFSETS(i, j, 1) = -1; } } else { /* Perform match. */ /* undocumented, must be zeroed */ memset(&match, 0, sizeof(match)); if(opt_counts) { match.nmatch = nmatch; match.pmatch = pmatch; } if(useBytes) rc = tre_regaexecb(®, CHAR(elt), &match, params, 0); else if(useWC) { rc = tre_regawexec(®, wtransChar(elt), &match, params, 0); vmaxset(vmax); } else { t = translateChar(elt); if(mbcslocale && !mbcsValid(t)) { error(_("input string y[%d] is invalid in this locale"), j + 1); } rc = tre_regaexec(®, t, &match, params, 0); vmaxset(vmax); } if(rc == REG_OK) { ANS(i, j) = (double) match.cost; if(opt_counts) { COUNTS(i, j, 0) = match.num_ins; COUNTS(i, j, 1) = match.num_del; COUNTS(i, j, 2) = match.num_subst; OFFSETS(i, j, 0) = match.pmatch[0].rm_so + 1; OFFSETS(i, j, 1) = match.pmatch[0].rm_eo; } } else { /* Should maybe check for REG_NOMATCH? */ ANS(i, j) = R_PosInf; if(opt_counts) { for(m = 0; m < 3; m++) { COUNTS(i, j, m) = NA_INTEGER; } OFFSETS(i, j, 0) = -1; OFFSETS(i, j, 1) = -1; } } } } if(opt_counts) free(pmatch); tre_regfree(®); } } PROTECT(x = getAttrib(x, R_NamesSymbol)); PROTECT(y = getAttrib(y, R_NamesSymbol)); if(!isNull(x) || !isNull(y)) { PROTECT(dimnames = allocVector(VECSXP, 2)); SET_VECTOR_ELT(dimnames, 0, x); SET_VECTOR_ELT(dimnames, 1, y); setAttrib(ans, R_DimNamesSymbol, dimnames); UNPROTECT(1); /* dimnames */ } if(opt_counts) { PROTECT(dimnames = allocVector(VECSXP, 3)); PROTECT(names = allocVector(STRSXP, 3)); SET_STRING_ELT(names, 0, mkChar("ins")); SET_STRING_ELT(names, 1, mkChar("del")); SET_STRING_ELT(names, 2, mkChar("sub")); SET_VECTOR_ELT(dimnames, 0, x); SET_VECTOR_ELT(dimnames, 1, y); SET_VECTOR_ELT(dimnames, 2, names); setAttrib(counts, R_DimNamesSymbol, dimnames); setAttrib(ans, install("counts"), counts); UNPROTECT(2); /* names, dimnames */ PROTECT(dimnames = allocVector(VECSXP, 3)); PROTECT(names = allocVector(STRSXP, 2)); SET_STRING_ELT(names, 0, mkChar("first")); SET_STRING_ELT(names, 1, mkChar("last")); SET_VECTOR_ELT(dimnames, 0, x); SET_VECTOR_ELT(dimnames, 1, y); SET_VECTOR_ELT(dimnames, 2, names); setAttrib(offsets, R_DimNamesSymbol, dimnames); setAttrib(ans, install("offsets"), offsets); UNPROTECT(4); /* names, dimnames, counts, offsets */ } UNPROTECT(3); /* y, x, counts */ return ans; } attribute_hidden SEXP do_aregexec(SEXP call, SEXP op, SEXP args, SEXP env) { SEXP pat, vec, ans, matchpos, matchlen; SEXP opt_bounds, opt_costs; int opt_icase, opt_fixed, useBytes; Rboolean haveBytes, useWC = FALSE; const char *s, *t; const void *vmax = NULL; regex_t reg; size_t nmatch; regmatch_t *pmatch; regaparams_t params; regamatch_t match; int so, patlen; int rc, cflags = REG_EXTENDED; checkArity(op, args); pat = CAR(args); args = CDR(args); vec = CAR(args); args = CDR(args); opt_bounds = CAR(args); args = CDR(args); opt_costs = CAR(args); args = CDR(args); opt_icase = asLogical(CAR(args)); args = CDR(args); opt_fixed = asLogical(CAR(args)); args = CDR(args); useBytes = asLogical(CAR(args)); if(opt_icase == NA_INTEGER) opt_icase = 0; if(opt_fixed == NA_INTEGER) opt_fixed = 0; if(useBytes == NA_INTEGER) useBytes = 0; if(opt_fixed && opt_icase) { warning(_("argument '%s' will be ignored"), "ignore.case = TRUE"); opt_icase = 0; } if(opt_fixed) cflags |= REG_LITERAL; if(opt_icase) cflags |= REG_ICASE; if(!isString(pat) || (length(pat) < 1) || (STRING_ELT(pat, 0) == NA_STRING)) error(_("invalid '%s' argument"), "pattern"); if(length(pat) > 1) warning(_("argument '%s' has length > 1 and only the first element will be used"), "pattern"); if(!isString(vec)) error(_("invalid '%s' argument"), "text"); R_xlen_t n = XLENGTH(vec); if(!useBytes) { haveBytes = IS_BYTES(STRING_ELT(pat, 0)); if(!haveBytes) for(R_xlen_t i = 0; i < n; i++) { if(IS_BYTES(STRING_ELT(vec, i))) { haveBytes = TRUE; break; } } if(haveBytes) useBytes = TRUE; } if(!useBytes) { useWC = !IS_ASCII(STRING_ELT(pat, 0)); if(!useWC) { for(R_xlen_t i = 0 ; i < n ; i++) { if(STRING_ELT(vec, i) == NA_STRING) continue; if(!IS_ASCII(STRING_ELT(vec, i))) { useWC = TRUE; break; } } } } SEXP s_nchar = install("nchar"); if(useBytes) PROTECT(call = lang3(s_nchar, pat, ScalarString(mkChar("bytes")))); else PROTECT(call = lang3(s_nchar, pat, ScalarString(mkChar("chars")))); patlen = asInteger(eval(call, env)); UNPROTECT(1); if(!patlen) error(_("'pattern' must be a non-empty character string")); if(useBytes) rc = tre_regcompb(®, CHAR(STRING_ELT(pat, 0)), cflags); else if(useWC) rc = tre_regwcomp(®, wtransChar(STRING_ELT(pat, 0)), cflags); else { s = translateChar(STRING_ELT(pat, 0)); if(mbcslocale && !mbcsValid(s)) error(_("regular expression is invalid in this locale")); rc = tre_regcomp(®, s, cflags); } if(rc) { char errbuf[1001]; tre_regerror(rc, ®, errbuf, 1001); error(_("regcomp error: '%s'"), errbuf); } nmatch = reg.re_nsub + 1; pmatch = (regmatch_t *) malloc(nmatch * sizeof(regmatch_t)); if(pmatch == NULL) error("allocation failure in aregexec"); tre_regaparams_default(¶ms); amatch_regaparams(¶ms, patlen, REAL(opt_bounds), INTEGER(opt_costs)); PROTECT(ans = allocVector(VECSXP, n)); for(R_xlen_t i = 0; i < n; i++) { // if ((i+1) % NINTERRUPT == 0) R_CheckUserInterrupt(); if(STRING_ELT(vec, i) == NA_STRING) { PROTECT(matchpos = ScalarInteger(NA_INTEGER)); SEXP s_match_length = install("match.length"); setAttrib(matchpos, s_match_length, ScalarInteger(NA_INTEGER)); SET_VECTOR_ELT(ans, i, matchpos); UNPROTECT(1); } else { vmax = vmaxget(); /* Perform match. */ memset(&match, 0, sizeof(match)); match.nmatch = nmatch; match.pmatch = pmatch; if(useBytes) rc = tre_regaexecb(®, CHAR(STRING_ELT(vec, i)), &match, params, 0); else if(useWC) { rc = tre_regawexec(®, wtransChar(STRING_ELT(vec, i)), &match, params, 0); vmaxset(vmax); } else { t = translateChar(STRING_ELT(vec, i)); if(mbcslocale && !mbcsValid(t)) error(_("input string %lld is invalid in this locale"), (long long)i + 1); rc = tre_regaexec(®, t, &match, params, 0); vmaxset(vmax); } if(rc == REG_OK) { PROTECT(matchpos = allocVector(INTSXP, nmatch)); PROTECT(matchlen = allocVector(INTSXP, nmatch)); for(R_xlen_t j = 0; j < match.nmatch; j++) { so = match.pmatch[j].rm_so; INTEGER(matchpos)[j] = so + 1; INTEGER(matchlen)[j] = match.pmatch[j].rm_eo - so; } setAttrib(matchpos, install("match.length"), matchlen); if(useBytes) setAttrib(matchpos, install("useBytes"), ScalarLogical(TRUE)); SET_VECTOR_ELT(ans, i, matchpos); UNPROTECT(2); } else { /* No match (or could there be an error?). */ /* Alternatively, could return nmatch -1 values. */ PROTECT(matchpos = ScalarInteger(-1)); PROTECT(matchlen = ScalarInteger(-1)); setAttrib(matchpos, install("match.length"), matchlen); SET_VECTOR_ELT(ans, i, matchpos); UNPROTECT(2); } } } free(pmatch); tre_regfree(®); UNPROTECT(1); return ans; }