# File src/library/stats/R/fisher.test.R # Part of the R package, https://www.R-project.org # # Copyright (C) 1995-2019 The R Core Team # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A copy of the GNU General Public License is available at # https://www.R-project.org/Licenses/ fisher.test <- function(x, y = NULL, workspace = 200000, hybrid = FALSE, hybridPars = c(expect = 5, percent = 80, Emin = 1), control = list(), or = 1, alternative = "two.sided", conf.int = TRUE, conf.level = 0.95, simulate.p.value = FALSE, B = 2000) { DNAME <- deparse1(substitute(x)) METHOD <- "Fisher's Exact Test for Count Data" if(is.data.frame(x)) x <- as.matrix(x) if(is.matrix(x)) { if(any(dim(x) < 2L)) stop("'x' must have at least 2 rows and columns") if(!is.numeric(x) || any(x < 0) || anyNA(x)) stop("all entries of 'x' must be nonnegative and finite") if(!is.integer(x)) { xo <- x x <- round(x) if(any(x > .Machine$integer.max)) stop("'x' has entries too large to be integer") if(!identical(TRUE, (ax <- all.equal(xo, x)))) warning(gettextf("'x' has been rounded to integer: %s", ax), domain = NA) storage.mode(x) <- "integer" } } else { if(is.null(y)) stop("if 'x' is not a matrix, 'y' must be given") if(length(x) != length(y)) stop("'x' and 'y' must have the same length") DNAME <- paste(DNAME, "and", deparse1(substitute(y))) OK <- complete.cases(x, y) ## use as.factor rather than factor here to be consistent with ## pre-tabulated data x <- as.factor(x[OK]) y <- as.factor(y[OK]) if((nlevels(x) < 2L) || (nlevels(y) < 2L)) stop("'x' and 'y' must have at least 2 levels") x <- table(x, y) } ## x is integer con <- list(mult = 30) con[names(control)] <- control if((mult <- as.integer(con$mult)) < 2) stop("'mult' must be integer >= 2, typically = 30") nr <- nrow(x) nc <- ncol(x) have.2x2 <- (nr == 2) && (nc == 2) if(have.2x2) { alternative <- char.expand(alternative, c("two.sided", "less", "greater")) if(length(alternative) > 1L || is.na(alternative)) stop("alternative must be \"two.sided\", \"less\" or \"greater\"") if(!((length(conf.level) == 1L) && is.finite(conf.level) && (conf.level > 0) && (conf.level < 1))) stop("'conf.level' must be a single number between 0 and 1") if(!missing(or) && (length(or) > 1L || is.na(or) || or < 0)) stop("'or' must be a single number between 0 and Inf") } PVAL <- NULL if(!have.2x2) { if(simulate.p.value) { ## we drop all-zero rows and columns sr <- rowSums(x) sc <- colSums(x) x <- x[sr > 0, sc > 0, drop = FALSE] nr <- as.integer(nrow(x)) nc <- as.integer(ncol(x)) if (is.na(nr) || is.na(nc) || is.na(nr * nc)) stop("invalid nrow(x) or ncol(x)", domain = NA) if(nr <= 1L) stop("need 2 or more non-zero row marginals") if(nc <= 1L) stop("need 2 or more non-zero column marginals") METHOD <- paste(METHOD, "with simulated p-value\n\t (based on", B, "replicates)") STATISTIC <- -sum(lfactorial(x)) tmp <- .Call(C_Fisher_sim, rowSums(x), colSums(x), B) ## use correct significance level for a Monte Carlo test almost.1 <- 1 + 64 * .Machine$double.eps ## PR#10558: STATISTIC is negative PVAL <- (1 + sum(tmp <= STATISTIC/almost.1)) / (B + 1) } else if(hybrid) { ## Cochran condition for asym.chisq. decision is ## hybridPars = c(expect = 5, percent = 80, Emin = 1), if(!is.null(nhP <- names(hybridPars)) && !identical(nhP, c("expect", "percent", "Emin"))) stop("names(hybridPars) should be NULL or be identical to the default's") stopifnot(is.double(hypp <- as.double(hybridPars)), length(hypp) == 3L, hypp[1] > 0, hypp[3] >= 0, 0 <= hypp[2], hypp[2] <= 100) PVAL <- .Call(C_Fexact, x, hypp, workspace, mult) METHOD <- paste(METHOD, sprintf("hybrid using asym.chisq. iff (exp=%g, perc=%g, Emin=%g)", hypp[1], hypp[2], hypp[3])) } else { ## expect < 0 : exact PVAL <- .Call(C_Fexact, x, c(-1, 100, 0), workspace, mult) } RVAL <- list(p.value = max(0, min(1, PVAL))) } else { ## conf.int and more only in 2 x 2 case if(hybrid) warning("'hybrid' is ignored for a 2 x 2 table") m <- sum(x[, 1L]) n <- sum(x[, 2L]) k <- sum(x[1L, ]) x <- x[1L, 1L] lo <- max(0L, k - n) hi <- min(k, m) NVAL <- c("odds ratio" = or) ## Note that in general the conditional distribution of x given ## the marginals is a non-central hypergeometric distribution H ## with non-centrality parameter ncp, the odds ratio. support <- lo : hi ## Density of the *central* hypergeometric distribution on its ## support: store for once as this is needed quite a bit. logdc <- dhyper(support, m, n, k, log = TRUE) dnhyper <- function(ncp) { ## Does not work for boundary values for ncp (0, Inf) but it ## does not need to. d <- logdc + log(ncp) * support d <- exp(d - max(d)) # beware of overflow d / sum(d) } mnhyper <- function(ncp) { if(ncp == 0) return(lo) if(ncp == Inf) return(hi) sum(support * dnhyper(ncp)) } pnhyper <- function(q, ncp = 1, upper.tail = FALSE) { if(ncp == 1) { return(if(upper.tail) phyper(x - 1, m, n, k, lower.tail = FALSE) else phyper(x, m, n, k)) } if(ncp == 0) { return(as.numeric(if(upper.tail) q <= lo else q >= lo)) } if(ncp == Inf) { return(as.numeric(if(upper.tail) q <= hi else q >= hi)) } ## else sum(dnhyper(ncp)[if(upper.tail) support >= q else support <= q]) } ## Determine the p-value (if still necessary). if(is.null(PVAL)) { PVAL <- switch(alternative, less = pnhyper(x, or), greater = pnhyper(x, or, upper.tail = TRUE), two.sided = { if(or == 0) as.numeric(x == lo) else if(or == Inf) as.numeric(x == hi) else { ## Note that we need a little fuzz. relErr <- 1 + 10 ^ (-7) d <- dnhyper(or) sum(d[d <= d[x - lo + 1] * relErr]) } }) RVAL <- list(p.value = PVAL) } ## Determine the MLE for ncp by solving E(X) = x, where the ## expectation is with respect to H. mle <- function(x) { if(x == lo) return(0) if(x == hi) return(Inf) mu <- mnhyper(1) if(mu > x) uniroot(function(t) mnhyper(t) - x, c(0, 1))$root else if(mu < x) 1 / uniroot(function(t) mnhyper(1/t) - x, c(.Machine$double.eps, 1))$root else 1 } ESTIMATE <- c("odds ratio" = mle(x)) if(conf.int) { ## Determine confidence intervals for the odds ratio. ncp.U <- function(x, alpha) { if(x == hi) return(Inf) p <- pnhyper(x, 1) if(p < alpha) uniroot(function(t) pnhyper(x, t) - alpha, c(0, 1))$root else if(p > alpha) 1 / uniroot(function(t) pnhyper(x, 1/t) - alpha, c(.Machine$double.eps, 1))$root else 1 } ncp.L <- function(x, alpha) { if(x == lo) return(0) p <- pnhyper(x, 1, upper.tail = TRUE) if(p > alpha) uniroot(function(t) pnhyper(x, t, upper.tail = TRUE) - alpha, c(0, 1))$root else if(p < alpha) 1 / uniroot(function(t) pnhyper(x, 1/t, upper.tail = TRUE) - alpha, c(.Machine$double.eps, 1))$root else 1 } CINT <- switch(alternative, less = c(0, ncp.U(x, 1 - conf.level)), greater = c(ncp.L(x, 1 - conf.level), Inf), two.sided = { alpha <- (1 - conf.level) / 2 c(ncp.L(x, alpha), ncp.U(x, alpha)) }) attr(CINT, "conf.level") <- conf.level } RVAL <- c(RVAL, list(conf.int = if(conf.int) CINT, estimate = ESTIMATE, null.value = NVAL)) } ## end (2 x 2) structure(c(RVAL, alternative = alternative, method = METHOD, data.name = DNAME), class = "htest") }