# File src/library/stats/R/cor.test.R # Part of the R package, https://www.R-project.org # # Copyright (C) 1995-2022 The R Core Team # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A copy of the GNU General Public License is available at # https://www.R-project.org/Licenses/ cor.test <- function(x, ...) UseMethod("cor.test") cor.test.default <- function(x, y, alternative = c("two.sided", "less", "greater"), method = c("pearson", "kendall", "spearman"), exact = NULL, conf.level = 0.95, continuity = FALSE, ...) { alternative <- match.arg(alternative) method <- match.arg(method) DNAME <- paste(deparse1(substitute(x)), "and", deparse1(substitute(y))) if(!is.numeric(x)) stop("'x' must be a numeric vector") if(!is.numeric(y)) stop("'y' must be a numeric vector") if(length(x) != length(y)) stop("'x' and 'y' must have the same length") OK <- complete.cases(x, y) x <- x[OK] y <- y[OK] n <- length(x) NVAL <- 0 conf.int <- FALSE if(method == "pearson") { if(n < 3L) stop("not enough finite observations") method <- "Pearson's product-moment correlation" names(NVAL) <- "correlation" r <- cor(x, y) df <- n - 2L ESTIMATE <- c(cor = r) PARAMETER <- c(df = df) STATISTIC <- c(t = sqrt(df) * r / sqrt(1 - r^2)) if(n > 3) { ## confidence int. if(!missing(conf.level) && (length(conf.level) != 1 || !is.finite(conf.level) || conf.level < 0 || conf.level > 1)) stop("'conf.level' must be a single number between 0 and 1") conf.int <- TRUE z <- atanh(r) sigma <- 1 / sqrt(n - 3) cint <- switch(alternative, less = c(-Inf, z + sigma * qnorm(conf.level)), greater = c(z - sigma * qnorm(conf.level), Inf), two.sided = z + c(-1, 1) * sigma * qnorm((1 + conf.level) / 2)) cint <- tanh(cint) attr(cint, "conf.level") <- conf.level } PVAL <- switch(alternative, "less" = pt(STATISTIC, df), "greater" = pt(STATISTIC, df, lower.tail=FALSE), "two.sided" = 2 * min(pt(STATISTIC, df), pt(STATISTIC, df, lower.tail=FALSE))) } else { if(n < 2) stop("not enough finite observations") PARAMETER <- NULL TIES <- (min(length(unique(x)), length(unique(y))) < n) if(method == "kendall") { method <- "Kendall's rank correlation tau" names(NVAL) <- "tau" r <- cor(x,y, method = "kendall") ESTIMATE <- c(tau = r) if(!is.finite(ESTIMATE)) { # all x or all y the same ESTIMATE[] <- NA STATISTIC <- c(T = NA) PVAL <- NA } else { if(is.null(exact)) exact <- (n < 50) if(exact && !TIES) { q <- round((r + 1) * n * (n - 1) / 4) STATISTIC <- c(T = q) pkendall <- function(q, n) .Call(C_pKendall, q, n) PVAL <- switch(alternative, "two.sided" = { if(q > n * (n - 1) / 4) p <- 1 - pkendall(q - 1, n) else p <- pkendall(q, n) min(2 * p, 1) }, "greater" = 1 - pkendall(q - 1, n), "less" = pkendall(q, n)) } else { xties <- table(x[duplicated(x)]) + 1 yties <- table(y[duplicated(y)]) + 1 T0 <- n * (n - 1)/2 T1 <- sum(xties * (xties - 1))/2 T2 <- sum(yties * (yties - 1))/2 S <- r * sqrt((T0 - T1) * (T0 - T2)) v0 <- n * (n - 1) * (2 * n + 5) vt <- sum(xties * (xties - 1) * (2 * xties + 5)) vu <- sum(yties * (yties - 1) * (2 * yties + 5)) v1 <- sum(xties * (xties - 1)) * sum(yties * (yties - 1)) v2 <- sum(xties * (xties - 1) * (xties - 2)) * sum(yties * (yties - 1) * (yties - 2)) var_S <- (v0 - vt - vu) / 18 + v1 / (2 * n * (n - 1)) + v2 / (9 * n * (n - 1) * (n - 2)) if(exact && TIES) warning("Cannot compute exact p-value with ties") if (continuity) S <- sign(S) * (abs(S) - 1) STATISTIC <- c(z = S / sqrt(var_S)) PVAL <- switch(alternative, "less" = pnorm(STATISTIC), "greater" = pnorm(STATISTIC, lower.tail=FALSE), "two.sided" = 2 * min(pnorm(STATISTIC), pnorm(STATISTIC, lower.tail=FALSE))) } } } else { method <- "Spearman's rank correlation rho" if (is.null(exact)) exact <- TRUE names(NVAL) <- "rho" r <- cor(rank(x), rank(y)) ESTIMATE <- c(rho = r) if(!is.finite(ESTIMATE)) { # all x or all y the same ESTIMATE[] <- NA STATISTIC <- c(S = NA) PVAL <- NA } else { ## Use the test statistic S = sum(rank(x) - rank(y))^2 ## and AS 89 for obtaining better p-values than via the ## simple normal approximation. ## In the case of no ties, S = (1-rho) * (n^3-n)/6. pspearman <- function(q, n, lower.tail = TRUE) { if(n <= 1290 && exact) # n*(n^2 - 1) does not overflow .Call(C_pRho, round(q) + 2*lower.tail, n, lower.tail) else { # for large n: asymptotic t_{n-2} den <- (n*(n^2-1))/6 # careful for overflow ## Kendall et all (1939) p. 260 if (continuity) den <- den + 1 r <- 1 - q/den pt(r / sqrt((1 - r^2)/(n-2)), df = n-2, lower.tail = !lower.tail) } } q <- (n^3 - n) * (1 - r) / 6 STATISTIC <- c(S = q) if(TIES && exact){ exact <- FALSE warning("Cannot compute exact p-value with ties") } PVAL <- switch(alternative, "two.sided" = { p <- if(q > (n^3 - n) / 6) pspearman(q, n, lower.tail = FALSE) else pspearman(q, n, lower.tail = TRUE) min(2 * p, 1) }, "greater" = pspearman(q, n, lower.tail = TRUE), "less" = pspearman(q, n, lower.tail = FALSE)) } } } RVAL <- list(statistic = STATISTIC, parameter = PARAMETER, p.value = as.numeric(PVAL), estimate = ESTIMATE, null.value = NVAL, alternative = alternative, method = method, data.name = DNAME) if(conf.int) RVAL <- c(RVAL, list(conf.int = cint)) class(RVAL) <- "htest" RVAL } cor.test.formula <- function(formula, data, subset, na.action, ...) { if(missing(formula) || !inherits(formula, "formula") || length(formula) != 2L) stop("'formula' missing or invalid") m <- match.call(expand.dots = FALSE) if(is.matrix(eval(m$data, parent.frame()))) m$data <- as.data.frame(data) ## need stats:: for non-standard evaluation m[[1L]] <- quote(stats::model.frame) m$... <- NULL mf <- eval(m, parent.frame()) if(length(mf) != 2L) stop("invalid formula") DNAME <- paste(names(mf), collapse = " and ") ## Call the default method. y <- cor.test(x = mf[[1L]], y = mf[[2L]], ...) y$data.name <- DNAME y }