# File src/library/stats/R/ks.test.R # Part of the R package, http://www.R-project.org # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A copy of the GNU General Public License is available at # http://www.r-project.org/Licenses/ ks.test <- function(x, y, ..., alternative = c("two.sided", "less", "greater"), exact = NULL) { pkolmogorov1x <- function(x, n) { ## Probability function for the one-sided one-sample Kolmogorov ## statistics, based on the formula of Birnbaum & Tingey (1951). if(x <= 0) return(0) if(x >= 1) return(1) j <- seq.int(from = 0, to = floor(n * (1 - x))) 1 - x * sum(exp(lchoose(n, j) + (n - j) * log(1 - x - j / n) + (j - 1) * log(x + j / n))) } alternative <- match.arg(alternative) DNAME <- deparse(substitute(x)) x <- x[!is.na(x)] n <- length(x) if(n < 1) stop("not enough 'x' data") PVAL <- NULL if(is.numeric(y)) { DNAME <- paste(DNAME, "and", deparse(substitute(y))) y <- y[!is.na(y)] n.x <- as.double(n) # to avoid integer overflow n.y <- length(y) if(n.y < 1) stop("not enough 'y' data") if(is.null(exact)) exact <- (n.x * n.y < 10000) METHOD <- "Two-sample Kolmogorov-Smirnov test" TIES <- FALSE n <- n.x * n.y / (n.x + n.y) w <- c(x, y) z <- cumsum(ifelse(order(w) <= n.x, 1 / n.x, - 1 / n.y)) if(length(unique(w)) < (n.x + n.y)) { warning("cannot compute correct p-values with ties") z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)] TIES <- TRUE } STATISTIC <- switch(alternative, "two.sided" = max(abs(z)), "greater" = max(z), "less" = - min(z)) nm_alternative <- switch(alternative, "two.sided" = "two-sided", "less" = "the CDF of x lies below that of y", "greater" = "the CDF of x lies above that of y") if(exact && (alternative == "two.sided") && !TIES) PVAL <- 1 - .C("psmirnov2x", p = as.double(STATISTIC), as.integer(n.x), as.integer(n.y), PACKAGE = "stats")$p } else { if(is.character(y)) y <- get(y, mode="function") if(mode(y) != "function") stop("'y' must be numeric or a string naming a valid function") if(is.null(exact)) exact <- (n < 100) METHOD <- "One-sample Kolmogorov-Smirnov test" TIES <- FALSE if(length(unique(x)) < n) { warning("cannot compute correct p-values with ties") TIES <- TRUE } x <- y(sort(x), ...) - (0 : (n-1)) / n STATISTIC <- switch(alternative, "two.sided" = max(c(x, 1/n - x)), "greater" = max(1/n - x), "less" = max(x)) if(exact && !TIES) { PVAL <- if(alternative == "two.sided") 1 - .C("pkolmogorov2x", p = as.double(STATISTIC), as.integer(n), PACKAGE = "stats")$p else 1 - pkolmogorov1x(STATISTIC, n) } nm_alternative <- switch(alternative, "two.sided" = "two-sided", "less" = "the CDF of x lies below the null hypothesis", "greater" = "the CDF of x lies above the null hypothesis") } names(STATISTIC) <- switch(alternative, "two.sided" = "D", "greater" = "D^+", "less" = "D^-") pkstwo <- function(x, tol = 1e-6) { ## Compute \sum_{-\infty}^\infty (-1)^k e^{-2k^2x^2} ## Not really needed at this generality for computing a single ## asymptotic p-value as below. if(is.numeric(x)) x <- as.vector(x) else stop("argument 'x' must be numeric") p <- rep(0, length(x)) p[is.na(x)] <- NA IND <- which(!is.na(x) & (x > 0)) if(length(IND) > 0) { p[IND] <- .C("pkstwo", as.integer(length(x[IND])), p = as.double(x[IND]), as.double(tol), PACKAGE = "stats")$p } return(p) } if(is.null(PVAL)) { ## ## Currently, p-values for the two-sided two-sample case are ## exact if n.x * n.y < 10000 (unless controlled explicitly). ## In all other cases, the asymptotic distribution is used ## directly. But: let m and n be the min and max of the sample ## sizes, respectively. Then, according to Kim and Jennrich ## (1973), if m < n / 10, we should use the ## * Kolmogorov approximation with c.c. -1/(2*n) if 1 < m < 80; ## * Smirnov approximation with c.c. 1/(2*sqrt(n)) if m >= 80. PVAL <- ifelse(alternative == "two.sided", 1 - pkstwo(sqrt(n) * STATISTIC), exp(- 2 * n * STATISTIC^2)) ## } RVAL <- list(statistic = STATISTIC, p.value = PVAL, alternative = nm_alternative, method = METHOD, data.name = DNAME) class(RVAL) <- "htest" return(RVAL) }