# File src/library/utils/R/aspell.R # Part of the R package, https://www.R-project.org # # Copyright (C) 1995-2024 The R Core Team # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A copy of the GNU General Public License is available at # https://www.R-project.org/Licenses/ aspell <- function(files, filter, control = list(), encoding = "unknown", program = NULL, dictionaries = character()) { ## Take the given files and feed them through spell checker in ## Ispell pipe mode. ## Think about options and more command line options eventually. program <- aspell_find_program(program) if(is.na(program)) stop("No suitable spell-checker program found") ## Be nice. if(inherits(files, "Rd")) files <- list(files) files_are_names <- is.character(files) filter_args <- list() if(missing(filter) || is.null(filter)) { filter <- if(!files_are_names) { function(ifile, encoding) { if(inherits(ifile, "srcfile")) readLines(ifile$filename, encoding = encoding, warn = FALSE) else if(inherits(ifile, "connection")) readLines(ifile, encoding = encoding, warn = FALSE) else { ## What should this do with encodings? as.character(ifile) } } } else NULL } else if(is.character(filter)) { ## Look up filter in aspell filter db. filter_name <- filter[1L] filter <- aspell_filter_db[[filter_name]] ## Warn if the filter was not found in the db. if(is.null(filter)) warning(gettextf("Filter '%s' is not available.", filter_name), domain = NA) } else if(is.list(filter)) { ## Support ## list("Rd", drop = "\\references" ## at least for now. filter_name <- filter[[1L]][1L] filter_args <- filter[-1L] filter <- aspell_filter_db[[filter_name]] ## Warn if the filter was not found in the db. if(is.null(filter)) warning(gettextf("Filter '%s' is not available.", filter_name), domain = NA) } else if(!is.function(filter)) stop("Invalid 'filter' argument.") encoding <- rep_len(encoding, length(files)) verbose <- getOption("verbose") db <- data.frame(Original = character(), File = character(), Line = integer(), Column = integer(), stringsAsFactors = FALSE) db$Suggestions <- list() tfile <- tempfile("aspell") on.exit(unlink(tfile)) if(length(dictionaries)) { paths <- aspell_find_dictionaries(dictionaries) ind <- paths == "" if(any(ind)) { warning(gettextf("The following dictionaries were not found:\n%s", paste(sprintf(" %s", dictionaries[ind]), collapse = "\n")), domain = NA) paths <- paths[!ind] } if(length(paths)) { words <- unlist(lapply(paths, readRDS), use.names = FALSE) personal <- tempfile("aspell_personal") on.exit(unlink(personal), add = TRUE) ## ## How can we get the right language set (if needed)? ## Maybe aspell() needs an additional 'language' arg? aspell_write_personal_dictionary_file(words, personal, program = program) ## control <- c(control, "-p", shQuote(personal)) } } ## No special expansion of control argument for now. control <- as.character(control) fnames <- names(files) files <- as.list(files) for (i in seq_along(files)) { file <- files[[i]] if(files_are_names) fname <- file else { ## Try srcfiles and srcrefs ... fname <- if(inherits(file, "srcfile")) file$filename else attr(attr(file, "srcref"), "srcfile")$filename ## As a last resort, try the names of the files argument. if(is.null(fname)) fname <- fnames[i] ## If unknown ... if(is.null(fname)) fname <- "" } enc <- encoding[i] if(verbose) message(gettextf("Processing file %s", fname), domain = NA) lines <- if(is.null(filter)) readLines(file, encoding = enc, warn = FALSE) else { ## Assume that filter takes an input file (and additional ## arguments) and return a character vector. do.call(filter, c(list(file, encoding = enc), filter_args)) } ## Allow filters to pass additional control arguments, in case ## these need to be inferred from the file contents. control <- c(control, attr(lines, "control")) ## Need to escape all lines with carets to ensure Aspell handles ## them as data: the Aspell docs say ## It is recommended that programmatic interfaces prefix every ## data line with an uparrow to protect themselves against ## future changes in Aspell. writeLines(paste0("^", lines), tfile) ## Note that this re-encodes character strings with marked ## encodings to the current encoding (which is definitely fine ## if this is UTF-8 and Aspell was compiled with full UTF-8 ## support). Alternatively, we could try using something along ## the lines of ## writeLines(paste0("^", lines), tfile, ## useBytes = TRUE) ## and pass the encoding info to Aspell in case we know it. out <- tools:::.system_with_capture(program, c("-a", control), stdin = tfile) if(out$status != 0L) stop(gettextf("Running aspell failed with diagnostics:\n%s", paste(out$stderr, collapse = "\n")), domain = NA) ## Hopefully everything worked ok. lines <- out$stdout[-1L] pos <- cumsum(lines == "") + 1L ## Format is as follows. ## First line is a header. ## Blank lines separate the results for each line. ## Results for the word on each line are given as follows. ## * If the word was found in the main dictionary, or your personal ## dictionary, then the line contains only a `*'. ## * If the word is not in the dictionary, but there are ## suggestions, then the line contains an `&', a space, the ## misspelled word, a space, the number of near misses, the number ## of characters between the beginning of the line and the ## beginning of the misspelled word, a colon, another space, and a ## list of the suggestions separated by commas and spaces. ## * If the word does not appear in the dictionary, and there are no ## suggestions, then the line contains a `#', a space, the ## misspelled word, a space, and the character offset from the ## beginning of the line. ## This can be summarized as follows: ## OK: * ## Suggestions: & original count offset: miss, miss, ... ## None: # original offset ## Look at words not in dictionary with suggestions. if(any(ind <- startsWith(lines, "&"))) { info <- strsplit(lines[ind], ": ", fixed = TRUE) one <- strsplit(sapply(info, `[`, 1L), " ", fixed = TRUE) two <- strsplit(sapply(info, `[`, 2L), ", ", fixed = TRUE) db1 <- list2DF(list(Original = vapply(one, `[`, "", 2L), File = rep_len(fname, length(one)), Line = pos[ind], Column = as.integer(vapply(one, `[`, "", 4L)), Suggestions = two)) db <- rbind(db, db1) } ## Looks at words not in dictionary with no suggestions. if(any(ind <- startsWith(lines, "#"))) { one <- strsplit(lines[ind], " ", fixed = TRUE) db1 <- list2DF(list(Original = vapply(one, `[`, "", 2L), File = rep_len(fname, length(one)), Line = pos[ind], Column = as.integer(vapply(one, `[`, "", 3L)), Suggestions = vector("list", length(one)))) db <- rbind(db, db1) } } class(db) <- c("aspell", "data.frame") db } format.aspell <- function(x, sort = TRUE, verbose = FALSE, indent = 2L, ...) { if(!nrow(x)) return(character()) if(sort) x <- x[order(x$Original, x$File, x$Line, x$Column), ] from <- split(sprintf("%s:%d:%d", x$File, x$Line, x$Column), x$Original) if(verbose) { unlist(Map(function(w, f, s) { sprintf("Word: %s\nFrom: %s\n%s", w, paste0(c("", rep.int(" ", length(f) - 1L)), f, collapse = "\n"), paste(strwrap(paste("Suggestions:", paste(s[[1L]], collapse = " ")), exdent = 6L, indent = 0L), collapse = "\n")) }, names(from), from, split(x$Suggestions, x$Original))) } else { sep <- sprintf("\n%s", strrep(" ", indent)) paste(names(from), vapply(from, paste, "", collapse = sep), sep = sep) } } print.aspell <- function(x, ...) { if(nrow(x)) writeLines(paste(format(x, ...), collapse = "\n\n")) invisible(x) } summary.aspell <- function(object, ...) { words <- sort(unique(object$Original)) if(length(words)) { writeLines("Possibly misspelled words:") print(words) } invisible(words) } aspell_filter_db <- new.env(hash = FALSE) # small aspell_filter_db$Rd <- function(ifile, encoding = "unknown", drop = character(), keep = character(), macros = file.path(R.home("share"), "Rd", "macros", "system.Rd"), ignore = character()) { lines <- tools::RdTextFilter(ifile, encoding, drop = drop, keep = keep, macros = macros) blank_out_ignores_in_lines(lines, ignore) } aspell_filter_db$Sweave <- tools::SweaveTeXFilter aspell_find_program <- function(program = NULL) { check <- !is.null(program) || !is.null(names(program)) if(is.null(program)) program <- getOption("aspell_program") if(is.null(program)) program <- c("aspell", "hunspell", "ispell") program <- Filter(nzchar, Sys.which(program))[1L] if(!is.na(program) && check) { out <- c(system(sprintf("%s -v", program), intern = TRUE), "")[1L] if(grepl("really Aspell", out)) names(program) <- "aspell" else if(grepl("really Hunspell", out)) names(program) <- "hunspell" else if(grepl("International Ispell", out)) names(program) <- "ispell" else names(program) <- NA_character_ } program } aspell_dictionaries_R <- "en_stats" aspell_find_dictionaries <- function(dictionaries, dirnames = character()) { dictionaries <- as.character(dictionaries) if(!(n <- length(dictionaries))) return(character()) ## Always search the R system dictionary directory first. dirnames <- c(file.path(R.home("share"), "dictionaries"), dirnames) ## For now, all dictionary files should be .rds files. if(any(ind <- !endsWith(dictionaries, ".rds"))) dictionaries[ind] <- sprintf("%s.rds", dictionaries[ind]) out <- character(n) ## Dictionaries with no path separators are looked for in the given ## dictionary directories (by default, the R system dictionary ## directory). ind <- grepl(.Platform$file.sep, dictionaries, fixed = TRUE) ## (Equivalently, could check where paths == basename(paths).) if(length(pos <- which(ind))) { pos <- pos[file_test("-f", dictionaries[pos])] out[pos] <- normalizePath(dictionaries[pos], "/") } if(length(pos <- which(!ind))) { out[pos] <- find_files_in_directories(dictionaries[pos], dirnames) } out } ### Utilities. aspell_inspect_context <- function(x) { x <- split(x, x$File) y <- Map(function(f, x) { lines <- readLines(f, warn = FALSE)[x$Line] cbind(f, x$Line, x$Column, substring(lines, 1L, x$Column - 1L), x$Original, substring(lines, x$Column + nchar(x$Original))) }, names(x), x) y <- data.frame(do.call(rbind, y), stringsAsFactors = FALSE) if(!length(y)) y <- list2DF(rep.int(list(character()), 6L)) names(y) <- c("File", "Line", "Column", "Left", "Original", "Right") class(y) <- c("aspell_inspect_context", "data.frame") y } format.aspell_inspect_context <- function(x, ..., byfile = FALSE, indent = 2L) { if(!nrow(x)) return(character()) chunks <- if(byfile) { chunks <- split(x, x$File) Map(function(u, e) c(sprintf("File '%s':", u), sprintf(" Line %s: \"%s\", \"%s\", \"%s\"", format(e$Line), gsub("\"", "\\\"", e$Left ), e$Original, gsub("\"", "\\\"", e$Right)), ""), names(chunks), chunks) } else { p <- strrep(" ", indent) y <- sprintf("%s%s:%s:%s\n%s%s%s%s\n%s%s%s", p, x$File, x$Line, x$Column, p, x$Left, x$Original, x$Right, p, strrep(" ", as.integer(x$Column) - 1L), strrep("^", nchar(x$Original))) chunks <- split(y, x$Original) Map(function(u, v) paste(c(paste("Word:", u), v), collapse = "\n"), names(chunks), chunks) } unlist(chunks, use.names = FALSE) } print.aspell_inspect_context <- function(x, ..., byfile = FALSE) { writeLines(format(x, ..., byfile = byfile)) invisible(x) } ## For spell-checking the R manuals: ## This can really only be done with Aspell as the other checkers have ## no texinfo mode. aspell_control_R_manuals <- list(aspell = c("--master=en_US", "--add-extra-dicts=en_GB", "--mode=texinfo", "--add-texinfo-ignore=I", "--add-texinfo-ignore=abbr", "--add-texinfo-ignore=acronym", "--add-texinfo-ignore=anchor", "--add-texinfo-ignore=deftypefun", "--add-texinfo-ignore=deftypefunx", "--add-texinfo-ignore=dfn", "--add-texinfo-ignore=findex", "--add-texinfo-ignore=enindex", "--add-texinfo-ignore=include", "--add-texinfo-ignore=ifclear", "--add-texinfo-ignore=ifset", "--add-texinfo-ignore=image", "--add-texinfo-ignore=key", "--add-texinfo-ignore=math", "--add-texinfo-ignore=multitable", "--add-texinfo-ignore=node", "--add-texinfo-ignore=printindex", "--add-texinfo-ignore=set", "--add-texinfo-ignore=value", "--add-texinfo-ignore=vindex", "--add-texinfo-ignore-env=direntry", "--add-texinfo-ignore-env=html", "--add-texinfo-ignore-env=macro", "--add-texinfo-ignore-env=menu", "--add-texinfo-ignore-env=tex", "--add-texinfo-ignore=CRANpkg", "--add-texinfo-ignore=cputype", "--add-texinfo-ignore=deqn", "--add-texinfo-ignore=eqn", "--add-texinfo-ignore=pkg", character() ), hunspell = c("-d en_US,en_GB")) aspell_R_manuals <- function(which = NULL, dir = NULL, program = NULL, dictionaries = c(aspell_dictionaries_R, "R_manuals")) { if(is.null(dir)) dir <- tools:::.R_top_srcdir_from_Rd() ## Allow specifying 'R-exts' and alikes, or full paths. files <- if(is.null(which)) { Sys.glob(file.path(dir, "doc", "manual", "*.texi")) } else { ind <- which(which == basename(tools::file_path_sans_ext(which))) which[ind] <- file.path(dir, "doc", "manual", sprintf("%s.texi", which[ind])) which } program <- aspell_find_program(program) aspell(files, control = aspell_control_R_manuals[[names(program)]], program = program, dictionaries = dictionaries) } ## For spell-checking the R Rd files: aspell_control_R_Rd_files <- list(aspell = c("--master=en_US", "--add-extra-dicts=en_GB"), hunspell = c("-d en_US,en_GB")) aspell_R_Rd_files <- function(which = NULL, dir = NULL, drop = c("\\abbr", "\\acronym", "\\author", "\\references"), program = NULL, dictionaries = c(aspell_dictionaries_R, "R_Rd_files")) { files <- character() if(is.null(dir)) dir <- tools:::.R_top_srcdir_from_Rd() if(is.null(which)) { which <- tools:::.get_standard_package_names()$base # CHANGES.Rd could be dropped from checks in the future; # it will not be updated post 2.15.0 files <- c(file.path(dir, "doc", "NEWS.Rd"), file.path(dir, "src", "gnuwin32", "CHANGES.Rd")) files <- files[file_test("-f", files)] } files <- c(files, unlist(lapply(file.path(dir, "src", "library", which, "man"), tools::list_files_with_type, "docs", OS_subdirs = c("unix", "windows")), use.names = FALSE)) ignore <- c(sprintf("\\b(%s)\\b", paste(c("a priori", "et seq", "post-mortem", "Inter alia", "inter alia", "2nd", "4th", "20th", "100th", "equi-", "intra-", "mis-", "Pre-", "pre-", "un-", "-ary", "-ness"), collapse = "|")), sprintf("(%s)\\b", paste(c("\$De\$", "\$de\$", "\$Un\$", "\$un\$", ## A literal 'nth' would even be in ## Wiktionary ## (), but ## we typically write \eqn{n}-th which after ## Rd filtering leaves '-th' by itself ... "-th", "'th"), collapse = "|"))) program <- aspell_find_program(program) aspell(files, filter = list("Rd", drop = drop, ignore = ignore), control = aspell_control_R_Rd_files[[names(program)]], program = program, dictionaries = dictionaries) } ## For spell-checking Rd files in a package: aspell_package_Rd_files <- function(dir, drop = c("\\abbr", "\\acronym", "\\author", "\\references"), control = list(), program = NULL, dictionaries = character()) { dir <- normalizePath(dir, "/") subdir <- file.path(dir, "man") files <- if(dir.exists(subdir)) tools::list_files_with_type(subdir, "docs", OS_subdirs = c("unix", "windows")) else character() meta <- tools:::.get_package_metadata(dir, installed = FALSE) if(is.na(encoding <- meta["Encoding"])) encoding <- "unknown" defaults <- .aspell_package_defaults(dir, encoding)$Rd_files if(!is.null(defaults)) { ## Direct settings currently override (could add a list add = ## TRUE mechanism eventually). if(!is.null(d <- defaults$drop)) drop <- d if(!is.null(d <- defaults$control)) control <- d if(!is.null(d <- defaults$program)) program <- d if(!is.null(d <- defaults$dictionaries)) { dictionaries <- aspell_find_dictionaries(d, file.path(dir, ".aspell")) } ## ## Deprecated in favor of specifying R level dictionaries. ## Maybe give a warning (in particular if both are given)? if(!is.null(d <- defaults$personal)) control <- c(control, sprintf("-p %s", shQuote(file.path(dir, ".aspell", d)))) ## } macros <- tools::loadPkgRdMacros(dir, macros = file.path(R.home("share"), "Rd", "macros", "system.Rd")) aspell(files, filter = list("Rd", drop = drop, macros = macros), control = control, encoding = encoding, program = program, dictionaries = dictionaries) } ## For spell-checking the R vignettes: ## This should really be done with Aspell as the other checkers have far ## less powerful TeX modes. aspell_control_R_vignettes <- list(aspell = c("--mode=tex", "--master=en_US", "--add-extra-dicts=en_GB", "--add-tex-command='I p'", "--add-tex-command='abbr p'", "--add-tex-command='author p'", "--add-tex-command='bibliographystyle p'", "--add-tex-command='citep p'", "--add-tex-command='citet p'", "--add-tex-command='code p'", "--add-tex-command='command p'", "--add-tex-command='definecolor ppp'", "--add-tex-command='file p'", "--add-tex-command='lstset p'", "--add-tex-command='lstinputlisting p'", "--add-tex-command='pkg p'", "--add-tex-command='CRANpkg p'" ), hunspell = c("-t", "-d en_US,en_GB")) aspell_R_vignettes <- function(program = NULL, dictionaries = c(aspell_dictionaries_R, "R_vignettes")) { files <- Sys.glob(file.path(tools:::.R_top_srcdir_from_Rd(), "src", "library", "*", "vignettes", "*.Rnw")) program <- aspell_find_program(program) aspell(files, filter = list("Sweave+LaTeX", cmds = c("Sexpr p", "SweaveOpts p", "code p", "hypersetup p")), control = aspell_control_R_vignettes[[names(program)]], program = program, dictionaries = dictionaries) } ## For spell-checking vignettes in a package: ## This should really be done with Aspell as the other checkers have far ## less powerful TeX modes. aspell_control_package_vignettes <- list(aspell = c("--add-tex-command='citep oop'", "--add-tex-command='Sexpr p'", "--add-tex-command='code p'", "--add-tex-command='pkg p'", "--add-tex-command='proglang p'", "--add-tex-command='samp p'" )) aspell_package_vignettes <- function(dir, control = list(), program = NULL, dictionaries = character()) { dir <- tools::file_path_as_absolute(dir) vinfo <- tools::pkgVignettes(dir = dir) files <- vinfo$docs if(!length(files)) return(aspell(character())) ## We need the package encoding to read the defaults file ... meta <- tools:::.get_package_metadata(dir, installed = FALSE) if(is.na(encoding <- meta["Encoding"])) encoding <- "unknown" defaults <- .aspell_package_defaults(dir, encoding)$vignettes if(!is.null(defaults)) { if(!is.null(d <- defaults$control)) control <- d if(!is.null(d <- defaults$program)) program <- d if(!is.null(d <- defaults$dictionaries)) { dictionaries <- aspell_find_dictionaries(d, file.path(dir, ".aspell")) } ## ## Deprecated in favor of specifying R level dictionaries. ## Maybe give a warning (in particular if both are given)? if(!is.null(d <- defaults$personal)) control <- c(control, sprintf("-p %s", shQuote(file.path(dir, ".aspell", d)))) ## } program <- aspell_find_program(program) fgroups <- split(files, vinfo$engines) egroups <- split(vinfo$encodings, vinfo$engines) do.call(rbind, Map(function(fgroup, egroup, engine) { engine <- tools::vignetteEngine(engine) aspell(fgroup, filter = engine$aspell$filter, control = c(engine$aspell$control, aspell_control_package_vignettes[[names(program)]], control), encoding = egroup, program = program, dictionaries = dictionaries) }, fgroups, egroups, names(fgroups) ) ) } ## Spell-checking R files. aspell_filter_db$R <- function(ifile, encoding = "unknown", ignore = character()) { pd <- get_parse_data_for_message_strings(ifile, encoding) if(is.null(pd) || !NROW(pd)) return(character()) ## Strip the string delimiters. pd$text <- substring(pd$text, 2L, nchar(pd$text) - 1L) ## Replace whitespace C backslash escape sequences by whitespace. pd$text <- gsub("(^|[^\\])\\\\[fnrt]", "\\1 ", pd$text) pd$text <- gsub( "([^\\])\\\\[fnrt]", "\\1 ", pd$text) ## (Do this twice for now because in e.g. ## \n\t\tInformation on package %s ## the first \t is not matched the first time. Alternatively, we ## could match with ## (^|[^\\])((\\\\[fnrt])+) ## but then computing the replacement (\\1 plus as many blanks as ## the characters in \\2) is not straightforward. ## For gettextf() calls, replace basic percent escape sequences by ## whitespace. ind <- pd$caller == "gettextf" if(any(ind)) { pd$text[ind] <- gsub("(^|[^%])%[dioxXfeEgGaAs]", "\\1 ", pd$text[ind]) pd$text[ind] <- gsub(" ([^%])%[dioxXfeEgGaAs]", "\\1 ", pd$text[ind]) ## (See above for doing this twice.) } lines <- readLines(ifile, encoding = encoding, warn = FALSE) ## Column positions in the parse data have tabs expanded to tab ## stops using a tab width of 8, so for lines with tabs we need to ## map the column positions back to character positions. lines_in_pd <- sort(unique(c(pd$line1, pd$line2))) tab <- Map(function(tp, nc) { if(tp[1L] == -1L) return(NULL) widths <- rep.int(1, nc) for(i in tp) { cols <- cumsum(widths) widths[i] <- 8 - (cols[i] - 1) %% 8 } cumsum(widths) }, gregexpr("\t", lines[lines_in_pd], fixed = TRUE), nchar(lines[lines_in_pd])) names(tab) <- lines_in_pd lines[lines_in_pd] <- gsub("[^\t]", " ", lines[lines_in_pd]) lines[-lines_in_pd] <- "" for(entry in split(pd, seq_len(NROW(pd)))) { line1 <- entry$line1 line2 <- entry$line2 col1 <- entry$col1 col2 <- entry$col2 if(line1 == line2) { if(length(ptab <- tab[[as.character(line1)]])) { col1 <- which(ptab == col1) + 1L col2 <- which(ptab == col2) - 1L } substring(lines[line1], col1, col2) <- entry$text } else { texts <- unlist(strsplit(entry$text, "\n", fixed = TRUE)) n <- length(texts) if(length(ptab <- tab[[as.character(line1)]])) { col1 <- which(ptab == col1) + 1L } substring(lines[line1], col1) <- texts[1L] pos <- seq.int(from = 2L, length.out = n - 2L) if(length(pos)) lines[line1 + pos - 1] <- texts[pos] if(length(ptab <- tab[[as.character(line2)]])) { col2 <- which(ptab == col2) - 1L } substring(lines[line2], 1L, col2) <- texts[n] } } blank_out_ignores_in_lines(lines, ignore) } get_parse_data_for_message_strings <- function(file, encoding = "unknown") { ## The message strings considered are the string constants subject to ## translation in gettext-family calls (see below for details). exprs <- suppressWarnings(tools:::.parse_code_file(file = file, encoding = encoding, keep.source = TRUE)) if(!length(exprs)) return(NULL) pd <- getParseData(exprs) ## Function for computing grandparent ids. parents <- pd$parent names(parents) <- pd$id gpids <- function(ids) parents[as.character(parents[as.character(ids)])] ind <- (pd$token == "SYMBOL_FUNCTION_CALL") & !is.na(match(pd$text, c("warning", "stop", "message", "packageStartupMessage", "gettext", "gettextf", "ngettext"))) funs <- pd$text[ind] ids <- gpids(pd$id[ind]) calls <- getParseText(pd, ids) table <- pd[pd$token == "STR_CONST", ] ## Could have run into truncation ... table$text <- getParseText(table, table$id) pos <- match(gpids(table$id), ids) ind <- !is.na(pos) table <- split(table[ind, ], factor(pos[ind], seq_along(ids))) ## We have synopses ## message(..., domain = NULL, appendLF = TRUE) ## packageStartupMessage(..., domain = NULL, appendLF = TRUE) ## warning(..., call. = TRUE, immediate. = FALSE, domain = NULL) ## stop(..., call. = TRUE, domain = NULL) ## gettext(..., domain = NULL) ## ngettext(n, msg1, msg2, domain = NULL) ## gettextf(fmt, ..., domain = NULL) ## For the first five, we simply take all unnamed strings. ## (Could make this more precise, of course.) ## For the latter two, we take the msg1/msg2 and fmt arguments, ## provided these are strings. ## ## Using domain = NA inhibits translation: perhaps it should ## optionally also inhibit spell checking? ## extract_message_strings <- function(fun, call, table) { ## Matching a call containing ... gives ## Error in match.call(message, call) : ## ... used in a situation where it doesn't exist ## so eliminate these. ## (Note that we also drop "..." strings.) call <- str2lang(call) call <- call[ as.character(call) != "..." ] mc <- as.list(match.call(get(fun, envir = .BaseNamespaceEnv), call)) args <- if(fun == "gettextf") mc["fmt"] else if(fun == "ngettext") mc[c("msg1", "msg2")] else { if(!is.null(names(mc))) mc <- mc[!nzchar(names(mc))] mc[-1L] } strings <- as.character(args[vapply(args, is.character, TRUE)]) ## Need to canonicalize to match string constants before and ## after parsing ... texts <- vapply(str2expression(table$text), as.character, "") pos <- which(!is.na(match(texts, strings))) cbind(table[pos, ], caller = rep.int(fun, length(pos))) } do.call(rbind, Map(extract_message_strings, as.list(funs), as.list(calls), table)) } ## For spell-checking the R R files. aspell_R_R_files <- function(which = NULL, dir = NULL, ignore = c("[ \t]'[^']*'[ \t[:punct:]]", "[ \t][[:alnum:]_.]*\$\$[ \t[:punct:]]"), program = NULL, dictionaries = aspell_dictionaries_R) { if(is.null(dir)) dir <- tools:::.R_top_srcdir_from_Rd() if(is.null(which)) which <- tools:::.get_standard_package_names()$base files <- unlist(lapply(file.path(dir, "src", "library", which, "R"), tools::list_files_with_type, "code", OS_subdirs = c("unix", "windows")), use.names = FALSE) program <- aspell_find_program(program) aspell(files, filter = list("R", ignore = ignore), control = aspell_control_R_Rd_files[[names(program)]], program = program, dictionaries = dictionaries) } ## For spell-checking R files in a package. aspell_package_R_files <- function(dir, ignore = character(), control = list(), program = NULL, dictionaries = character()) { dir <- tools::file_path_as_absolute(dir) subdir <- file.path(dir, "R") files <- if(dir.exists(subdir)) tools::list_files_with_type(subdir, "code", OS_subdirs = c("unix", "windows")) else character() meta <- tools:::.get_package_metadata(dir, installed = FALSE) if(is.na(encoding <- meta["Encoding"])) encoding <- "unknown" defaults <- .aspell_package_defaults(dir, encoding)$R_files if(!is.null(defaults)) { if(!is.null(d <- defaults$ignore)) ignore <- d if(!is.null(d <- defaults$control)) control <- d if(!is.null(d <- defaults$program)) program <- d if(!is.null(d <- defaults$dictionaries)) { dictionaries <- aspell_find_dictionaries(d, file.path(dir, ".aspell")) } } program <- aspell_find_program(program) aspell(files, filter = list("R", ignore = ignore), control = control, encoding = encoding, program = program, dictionaries = dictionaries) } ## Spell-checking pot files. ## (Of course, directly analyzing the message strings would be more ## useful, but require writing appropriate text filters.) ## See also tools:::checkPoFile(). aspell_filter_db$pot <- function (ifile, encoding = "unknown", ignore = character()) { lines <- readLines(ifile, encoding = encoding, warn = FALSE) ind <- grepl("^msgid[ \t]", lines) do_entry <- function(s) { out <- character(length(s)) i <- 1L out[i] <- blank_out_regexp_matches(s[i], "^msgid[ \t]+\"") while(startsWith(s[i <- i + 1L], '"')) out[i] <- sub("^\"", " ", s[i]) if(grepl("^msgid_plural[ \t]", s[i])) { out[i] <- blank_out_regexp_matches(s[i], "^msgid_plural[ \t]+\"") while(startsWith(s[i <- i + 1L], '"')) out[i] <- sub("^\"", " ", s[i]) } out } entries <- split(lines, cumsum(ind)) lines <- c(character(length(entries[[1L]])), as.character(do.call(c, lapply(entries[-1L], do_entry)))) lines <- sub("\"[ \t]*$", " ", lines) ## ## Could replace backslash escapes for blanks and percent escapes by ## blanks, similar to what the R text filter does. ## blank_out_ignores_in_lines(lines, ignore) } ## For spell-checking all pot files in a package. aspell_package_pot_files <- function(dir, ignore = character(), control = list(), program = NULL, dictionaries = character()) { dir <- tools::file_path_as_absolute(dir) subdir <- file.path(dir, "po") files <- if(dir.exists(subdir)) Sys.glob(file.path(subdir, "*.pot")) else character() meta <- tools:::.get_package_metadata(dir, installed = FALSE) if(is.na(encoding <- meta["Encoding"])) encoding <- "unknown" program <- aspell_find_program(program) aspell(files, filter = list("pot", ignore = ignore), control = control, encoding = encoding, program = program, dictionaries = dictionaries) } ## For spell-checking the R C files. aspell_R_C_files <- function(which = NULL, dir = NULL, ignore = c("[ \t]'[[:alnum:]_.]*'[ \t[:punct:]]", "[ \t][[:alnum:]_.]*\$\$[ \t[:punct:]]"), program = NULL, dictionaries = aspell_dictionaries_R) { if(is.null(dir)) dir <- tools:::.R_top_srcdir_from_Rd() if(is.null(which)) which <- tools:::.get_standard_package_names()$base if(!is.na(pos <- match("base", which))) which[pos] <- "R" files <- sprintf("%s.pot", file.path(dir, "src", "library", which, "po", which)) files <- files[file_test("-f", files)] program <- aspell_find_program(program) aspell(files, filter = list("pot", ignore = ignore), control = aspell_control_R_Rd_files[[names(program)]], program = program, dictionaries = dictionaries) } ## For spell-checking package C files. aspell_package_C_files <- function(dir, ignore = character(), control = list(), program = NULL, dictionaries = character()) { dir <- tools::file_path_as_absolute(dir) ## Assume that the package C message template file is shipped as ## 'po/PACKAGE.pot'. files <- file.path(dir, "po", paste(basename(dir), "pot", collapse = ".")) files <- files[file_test("-f", files)] meta <- tools:::.get_package_metadata(dir, installed = FALSE) if(is.na(encoding <- meta["Encoding"])) encoding <- "unknown" defaults <- .aspell_package_defaults(dir, encoding)$C_files if(!is.null(defaults)) { if(!is.null(d <- defaults$ignore)) ignore <- d if(!is.null(d <- defaults$control)) control <- d if(!is.null(d <- defaults$program)) program <- d if(!is.null(d <- defaults$dictionaries)) { dictionaries <- aspell_find_dictionaries(d, file.path(dir, ".aspell")) } } program <- aspell_find_program(program) aspell(files, filter = list("pot", ignore = ignore), control = control, encoding = encoding, program = program, dictionaries = dictionaries) } ## Spell-checking DCF files. aspell_filter_db$dcf <- function(ifile, encoding, keep = c("Title", "Description"), ignore = character()) { lines <- readLines(ifile, encoding = encoding, warn = FALSE) line_has_tags <- grepl("^[^[:blank:]][^:]*:", lines) tags <- sub(":.*", "", lines[line_has_tags]) lines[line_has_tags] <- blank_out_regexp_matches(lines[line_has_tags], "^[^:]*:") lines <- split(lines, cumsum(line_has_tags)) ind <- is.na(match(tags, keep)) lines[ind] <- lapply(lines[ind], function(s) rep.int("", length(s))) ind <- !ind lines[ind] <- lapply(lines[ind], paste0, " ") lines <- unlist(lines, use.names = FALSE) blank_out_ignores_in_lines(lines, ignore) } ## For spell-checking package DESCRIPTION files. aspell_package_description <- function(dir, ignore = character(), control = list(), program = NULL, dictionaries = character()) { dir <- tools::file_path_as_absolute(dir) files <- file.path(dir, "DESCRIPTION") meta <- tools:::.get_package_metadata(dir, installed = FALSE) if(is.na(encoding <- meta["Encoding"])) encoding <- "unknown" ## Allow providing package defaults but make this controllable via ## _R_ASPELL_USE_DEFAULTS_FOR_PACKAGE_DESCRIPTION_ ## to safeguard against possible mis-use for CRAN incoming checks. defaults <- Sys.getenv("_R_ASPELL_USE_DEFAULTS_FOR_PACKAGE_DESCRIPTION_", "TRUE") defaults <- if(str2logical(defaults)) { .aspell_package_defaults(dir, encoding)$description } else NULL if(!is.null(defaults)) { if(!is.null(d <- defaults$ignore)) ignore <- d if(!is.null(d <- defaults$control)) control <- d if(!is.null(d <- defaults$program)) program <- d if(!is.null(d <- defaults$dictionaries)) { dictionaries <- aspell_find_dictionaries(d, file.path(dir, ".aspell")) } } program <- aspell_find_program(program) aspell(files, filter = list("dcf", ignore = ignore), control = control, encoding = encoding, program = program, dictionaries = dictionaries) } ## Spell-checking Markdown files. aspell_filter_db$md <- function(ifile, encoding = "UTF-8") { x <- readLines(ifile, encoding = encoding, warn = FALSE) n <- nchar(x) y <- strrep(rep.int(" ", length(x)), n) ## Determine positions of 'texts' along the lines of ## spelling::parse_text_md () by Jeroen Ooms. md <- commonmark::markdown_xml(x, extensions = TRUE, sourcepos = TRUE) doc <- xml2::xml_ns_strip(xml2::read_xml(md)) pos <- strsplit(xml2::xml_attr(xml2::xml_find_all(doc, "//text[@sourcepos]"), "sourcepos"), "[:-]") ## Now use the following idea. ## Each elt of pos now has positions for l1:c1 to l2:c2. ## If l1 < l2 ## Lines in (l1, l2) are taken as a whole ## Line l1 from c1 to nchar for l1 ## Line l2 from 1 to c1 ## otherwise ## Line l1 from c1 to c2. for(p in pos) { p <- as.integer(p) ## Legibility ... l1 <- p[1L]; c1 <- p[2L]; l2 <- p[3L]; c2 <- p[4L] if(l1 < l2) { substring(y[l1], c1, n[l1]) <- substring(x[l1], c1, n[l1]) if(l1 + 1L < l2) { w <- seq.int(from = l1 + 1L, to = l2 - 1L) y[w] <- x[w] } substring(y[l2], 1L, c2) <- substring(x[l2], 1L, c2) } else { substring(y[l1], c1, c2) <- substring(x[l1], c1, c2) } } y } ## Spell-checking LaTeX files. ## Aspell provides customizable filtering of command arguments, but has ## problems when arguments contain braces, and does not allow filtering ## verbatims or environments. aspell_filter_db$LaTeX <- function(ifile, encoding = "unknown", ...) aspell_filter_LaTeX_worker(readLines(ifile, encoding = encoding), ...) aspell_filter_LaTeX_worker <- function(x, vrbs = c("verbatim", "verbatim*", "Sinput", "Soutput"), cmds = NULL, envs = NULL) { ranges <- list() chrran <- function(e) getSrcref(e)[c(1L, 5L, 3L, 6L)] ltxtag <- function(e) { tag <- attr(e, "latex_tag") if(is.null(tag)) "NULL" else tag } if(length(cmds)) { cmds <- c(cmds, aspell_filter_LaTeX_commands) cmds <- strsplit(trimws(cmds), " +") ones <- vapply(cmds, `[[`, "", 1L) ## For now always ignore optional arguments. twos <- vapply(cmds, `[[`, "", 2L) cmds <- lapply(strsplit(gsub("[^pP]", "", twos), ""), function(e) which(e == "p")) names(cmds) <- paste0("\\", ones) } recurse <- function(e) { tag <- ltxtag(e) if((tag == "VERB") || ((tag == "ENVIRONMENT") && e[[1L]] %in% envs)) ranges <<- c(ranges, list(chrran(e))) else if(is.list(e)) { if(length(cmds)) { skip <- integer() tags <- vapply(e, ltxtag, "") ## Are there any macros listed in cmds? mpos <- which(tags == "MACRO") mpos <- mpos[vapply(e[mpos], `[[`, "", 1L) %in% names(cmds)] if(length(mpos)) { bpos <- which(tags == "BLOCK") for(m in mpos) { skip <- c(skip, bpos[bpos > m][cmds[[e[[m]][[1L]]]]]) } for(s in skip) { ran <- chrran(e[[s]]) ## Keep the braces. ran[2L] <- ran[2L] + 1L ran[4L] <- ran[4L] - 1L ranges <<- c(ranges, list(ran)) } e <- e[-skip] } } lapply(e, recurse) } } recurse(tools::parseLatex(x, verbatim = vrbs)) blank_out_character_ranges(x, ranges) } aspell_filter_LaTeX_commands_from_Aspell_tex_filter_info <- function(dir) { x <- readLines(file.path(dir, "modules/filter/tex-filter.info"), encoding = "UTF-8") ## Extract 'OPTION command' block. x <- x[seq.int(which(x == "OPTION command"), length(x))] x <- x[seq.int(1L, which(x == "ENDOPTION")[1L])] ## Extract command defaults. substring(x[startsWith(x, "DEFAULT")], 9L) } aspell_filter_LaTeX_commands <- c("addtocounter pp", "addtolength pp", "alpha p", "arabic p", "fnsymbol p", "roman p", "stepcounter p", "setcounter pp", "usecounter p", "value p", "newcounter po", "refstepcounter p", "label p", "pageref p", "ref p", "newcommand poOP", "renewcommand poOP", "newenvironment poOPP", "renewenvironment poOPP", "newtheorem poPo", "newfont pp", "documentclass op", "usepackage op", "begin po", "end p", "setlength pp", "addtolength pp", "settowidth pp", "settodepth pp", "settoheight pp", "enlargethispage p", "hyphenation p", "pagenumbering p", "pagestyle p", "addvspace p", "framebox ooP", "hspace p", "vspace p", "makebox ooP", "parbox ooopP", "raisebox pooP", "rule opp", "sbox pO", "savebox pooP", "usebox p", "include p", "includeonly p", "input p", "addcontentsline ppP", "addtocontents pP", "fontencoding p", "fontfamily p", "fontseries p", "fontshape p", "fontsize pp", "usefont pppp", "documentstyle op", "cite p", "nocite p", "psfig p", "selectlanguage p", "includegraphics op", "bibitem op", "geometry p") ## ## Try to merge into the Sweave filter. ## Note that currently we cannot pass filter args when using ## aspell_package_vignettes(). aspell_filter_db$`Sweave+LaTeX` <- function(ifile, encoding = "unknown", ...) aspell_filter_LaTeX_worker(tools::SweaveTeXFilter(ifile, encoding), ...) ## ## For spell checking packages. aspell_package <- function(dir, control = list(), program = NULL, dictionaries = character()) { args <- list(dir = dir, program = program, control = control, dictionaries = dictionaries) a <- rbind(do.call(aspell_package_description, args), do.call(aspell_package_Rd_files, args), do.call(aspell_package_vignettes, args), do.call(aspell_package_R_files, args), do.call(aspell_package_C_files, args)) if(nrow(a)) { a$File <- tools:::file_path_relative_to(a$File, dirname(dir), parent = FALSE) } a } ## For writing personal dictionaries: aspell_write_personal_dictionary_file <- function(x, out, language = "en", program = NULL) { if(inherits(x, "aspell")) x <- sort(unique(x$Original)) program <- aspell_find_program(program) if(is.na(program)) stop("No suitable spell check program found.") ## ## Ispell and Hunspell take simple word lists as personal dictionary ## files, but Aspell requires a special format, see e.g. ## http://aspell.net/man-html/Format-of-the-Personal-and-Replacement-Dictionaries.html ## and one has to create these by hand, as ## aspell --lang=en create personal ./foo "a b c" ## gives: Sorry "create/merge personal" is currently unimplemented. ## Encodings are a nightmare. ## Try to canonicalize to UTF-8 for Aspell (which allows recording ## the encoding in the personal dictionary). ## ## What should we do for Hunspell (which can handle UTF-8, but has ## no encoding information in the personal dictionary), or Ispell ## (which cannot handle UTF-8)? ## if(names(program) == "aspell") { header <- sprintf("personal_ws-1.1 %s %d UTF-8", language, length(x)) x <- enc2utf8(x) } else { header <- NULL } writeLines(c(header, x), out, useBytes = TRUE) } ## For reading package defaults: .aspell_package_defaults <- function(dir, encoding = "unknown") { dfile <- file.path(dir, ".aspell", "defaults.R") if(!file_test("-f", dfile)) return(NULL) exprs <- parse(dfile, encoding = encoding) envir <- new.env() for(e in exprs) eval(e, envir) as.list(envir) } ## Utilities. blank_out_regexp_matches <- function(s, re, ...) { m <- gregexpr(re, s, ...) regmatches(s, m) <- Map(function(n) strrep(" ", n), lapply(regmatches(s, m), nchar)) s } blank_out_ignores_in_lines <- function(lines, ignore) { args <- list() if(is.list(ignore)) { args <- ignore[-1L] ignore <- ignore[[1L]] } for(re in ignore[nzchar(ignore)]) lines <- do.call(blank_out_regexp_matches, c(list(lines, re), args)) lines } ## ## Should this also be used in the md filter? blank_out_character_ranges <- function(s, ranges) { for(r in ranges) { ## Legibility ... l1 <- r[1L]; c1 <- r[2L] l2 <- r[3L]; c2 <- r[4L] if(l1 == l2) { substring(s[l1], c1, c2) <- strrep(" ", c2 - c1 + 1L) } else { substring(s[l1], c1, nchar(s[l1])) <- "" for(i in seq(l1 + 1L, length.out = l2 - l1 - 1L)) s[i] <- "" substring(s[l2], 1L, c2) <- strrep(" ", c2) } } s } ## find_files_in_directories <- function(basenames, dirnames) { dirnames <- dirnames[dir.exists(dirnames)] dirnames <- normalizePath(dirnames, "/") out <- character(length(basenames)) pos <- seq_along(out) for(dir in dirnames) { paths <- file.path(dir, basenames[pos]) ind <- file_test("-f", paths) out[pos[ind]] <- paths[ind] pos <- pos[!ind] if(!length(pos)) break } out } aspell_query_wiktionary_categories <- function(x) { if(inherits(x, "aspell")) { x <- unique(x$Original) } verbose <- getOption("verbose") ## Need to split into chunks of size 50 if necessary: n <- length(x) k <- n %/% 50L ind <- c(rep.int(seq_len(k), rep.int(50L, k)), rep.int(k + 1L, n %% 50L)) y <- lapply(split(x, ind), function(s) { q <- URLencode(sprintf("https://en.wiktionary.org/w/api.php?action=query&prop=categories&format=json&cllimit=20&titles=%s", paste(s, collapse = "|"))) if(verbose) message(sprintf("Performing query %s", q)) u <- "" v <- list() repeat { w <- jsonlite::fromJSON(paste0(q, u)) v <- c(v, w$query$pages) if(is.null(u <- w$continue$clcontinue)) break u <- paste0("&clcontinue=", URLencode(u)) } ## Gather results. v <- do.call(rbind, lapply(v, function(e) list(e$title, e$categories$title))) lapply(split(v[, 2L], unlist(v[, 1L], use.names = FALSE)), unlist, use.names = FALSE) }) Reduce(c, y)[x] } aspell_update_dictionary <- function(dictionary, add = character()) { stopifnot(is.character(dictionary), length(dictionary) == 1L) ## Handle 'dictionary' the same way as the 'dictionaries' argument ## to aspell(): if there is no path separator take as an R system ## dictionary. if(!grepl(.Platform$file.sep, dictionary, fixed = TRUE)) { dictionary <- file.path(tools:::.R_top_srcdir_from_Rd(), "share", "dictionaries", dictionary) } txt <- paste0(dictionary, ".txt") rds <- paste0(dictionary, ".rds") new <- unique(c(if(file.exists(txt)) readLines(txt, encoding = "UTF-8"), enc2utf8(add))) new <- new[order(tolower(new), new)] new <- new[nzchar(new)] writeLines(new, txt, useBytes = TRUE) saveRDS(new, rds) }