# File src/library/stats/R/kmeans.R # Part of the R package, https://www.R-project.org # # Copyright (C) 1995-2017 The R Core Team # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A copy of the GNU General Public License is available at # https://www.R-project.org/Licenses/ kmeans <- function(x, centers, iter.max = 10L, nstart = 1L, algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"), trace = FALSE) { .Mimax <- .Machine$integer.max do_one <- function(nmeth) { switch(nmeth, { # 1 : Hartigan-Wong isteps.Qtran <- as.integer(min(.Mimax, 50 * m)) iTran <- c(isteps.Qtran, integer(k)) # correct for k=0 Z <- .Fortran(C_kmns, x, m, p, centers = centers, as.integer(k), c1 = integer(m), c2 = integer(m), nc = integer(k), double(k), double(k), ncp=integer(k), D = double(m), iTran = iTran, live = integer(k), iter = iter.max, wss = double(k), ifault = as.integer(trace)) switch(Z$ifault, ## 1: stop("empty cluster: try a better set of initial centers", call. = FALSE), ## 2: Z$iter <- max(Z$iter, iter.max+1L), # -> and warn below ## 3: stop("number of cluster centres must lie between 1 and nrow(x)", call.=FALSE), ## 4: {new @ 2013-06-30; maybe better fix (in Fortran) ?} warning(gettextf("Quick-TRANSfer stage steps exceeded maximum (= %d)", isteps.Qtran), call.=FALSE) ) }, { # 2 : Lloyd-Forgy Z <- .C(C_kmeans_Lloyd, x, m, p, centers = centers, k, c1 = integer(m), iter = iter.max, nc = integer(k), wss = double(k)) }, { # 3 : MacQueen Z <- .C(C_kmeans_MacQueen, x, m, p, centers = as.double(centers), k, c1 = integer(m), iter = iter.max, nc = integer(k), wss = double(k)) }) if(m23 <- any(nmeth == c(2L, 3L))) { if(any(Z$nc == 0)) warning("empty cluster: try a better set of initial centers", call. = FALSE) } if(Z$iter > iter.max) { warning(sprintf(ngettext(iter.max, "did not converge in %d iteration", "did not converge in %d iterations"), iter.max), call. = FALSE, domain = NA) if(m23) Z$ifault <- 2L } if(nmeth %in% c(2L, 3L)) { if(any(Z$nc == 0)) warning("empty cluster: try a better set of initial centers", call. = FALSE) } Z } x <- as.matrix(x) ## as.integer() gives NA ==> not allowing too large nrow() / ncol(): m <- as.integer(nrow(x)); if(is.na(m)) stop("invalid nrow(x)") p <- as.integer(ncol(x)); if(is.na(p)) stop("invalid ncol(x)") if(missing(centers)) stop("'centers' must be a number or a matrix") nmeth <- switch(match.arg(algorithm), "Hartigan-Wong" = 1L, "Lloyd" = 2L, "Forgy" = 2L, "MacQueen" = 3L) storage.mode(x) <- "double" if(length(centers) == 1L) { k <- centers ## we need to avoid duplicates here if(nstart == 1L) centers <- x[sample.int(m, k), , drop = FALSE] if(nstart >= 2L || any(duplicated(centers))) { cn <- unique(x) mm <- nrow(cn) if(mm < k) stop("more cluster centers than distinct data points.") centers <- cn[sample.int(mm, k), , drop=FALSE] } } else { centers <- as.matrix(centers) if(any(duplicated(centers))) stop("initial centers are not distinct") cn <- NULL k <- nrow(centers) if(m < k) stop("more cluster centers than data points") } k <- as.integer(k) if(is.na(k)) stop(gettextf("invalid value of %s", "'k'"), domain = NA) if (k == 1L) nmeth <- 3L # Hartigan-Wong, (Fortran) needs k > 1 iter.max <- as.integer(iter.max) if(is.na(iter.max) || iter.max < 1L) stop("'iter.max' must be positive") if(ncol(x) != ncol(centers)) stop("must have same number of columns in 'x' and 'centers'") storage.mode(centers) <- "double" Z <- do_one(nmeth) best <- sum(Z$wss) if(nstart >= 2L && !is.null(cn)) for(i in 2:nstart) { centers <- cn[sample.int(mm, k), , drop=FALSE] ZZ <- do_one(nmeth) if((z <- sum(ZZ$wss)) < best) { Z <- ZZ best <- z } } centers <- matrix(Z$centers, k) dimnames(centers) <- list(1L:k, dimnames(x)[[2L]]) cluster <- Z$c1 if(!is.null(rn <- rownames(x))) names(cluster) <- rn totss <- sum(scale(x, scale = FALSE)^2) structure(list(cluster = cluster, centers = centers, totss = totss, withinss = Z$wss, tot.withinss = best, betweenss = totss - best, size = Z$nc, iter = Z$iter, ifault = Z$ifault), class = "kmeans") } ## modelled on print methods in the cluster package print.kmeans <- function(x, ...) { cat("K-means clustering with ", length(x$size), " clusters of sizes ", paste(x$size, collapse = ", "), "\n", sep = "") cat("\nCluster means:\n") print(x$centers, ...) cat("\nClustering vector:\n") print(x$cluster, ...) cat("\nWithin cluster sum of squares by cluster:\n") print(x$withinss, ...) ratio <- sprintf(" (between_SS / total_SS = %5.1f %%)\n", 100 * x$betweenss/x$totss) cat(sub(".", getOption("OutDec"), ratio, fixed = TRUE), "Available components:\n", sep = "\n") print(names(x)) if(!is.null(x$ifault) && x$ifault == 2L) cat("Warning: did *not* converge in specified number of iterations\n") invisible(x) } fitted.kmeans <- function(object, method = c("centers", "classes"), ...) { method <- match.arg(method) if (method == "centers") object$centers[object$cluster, , drop = FALSE] else object$cluster }