### $Id$ plot.partition <- function(x, ask = FALSE, which.plots = NULL, nmax.lab = 40, max.strlen = 5, data = x$data, dist = NULL, stand = FALSE, lines = 2, shade = FALSE, color = FALSE, labels = 0, plotchar = TRUE, span = TRUE, xlim = NULL, ylim = NULL, main = NULL, ...) { if(is.null(x$data))# data not kept x$data <- data if(is.null(x$data) && !is.null(dist)) x$diss <- dist if(is.null(which.plots) && !ask) which.plots <- { if(is.null(x$data) && (is.null(x$diss) || inherits(x, "clara"))) 2 ## no clusplot else 1:2 } if(ask && is.null(which.plots)) { ## Use 'menu' .. tmenu <- paste("plot ", ## choices : c("All", "Clusplot", "Silhouette Plot")) do.all <- FALSE repeat { if(!do.all) pick <- menu(tmenu, title = "\nMake a plot selection (or 0 to exit):\n") + 1 switch(pick, return(invisible())# 0 -> exit loop , do.all <- TRUE# 1 : All , clusplot(x, stand = stand, lines = lines, shade = shade, color = color, labels = labels, plotchar = plotchar, span = span, xlim = xlim, ylim = ylim, main = main, ...) , plot(silhouette(x), nmax.lab, max.strlen, main = main) ) if(do.all) { pick <- pick + 1; do.all <- pick <= length(tmenu) + 1} } invisible() } else { ask <- prod(par("mfcol")) < length(which.plots) && dev.interactive() if(ask) { op <- par(ask = TRUE); on.exit(par(op)) } for(i in which.plots) switch(i, clusplot(x, stand = stand, lines = lines, shade = shade, color = color, labels = labels, plotchar = plotchar, span = span, xlim = xlim, ylim = ylim, main = main, ...) , plot(silhouette(x), nmax.lab, max.strlen, main = main) ) ## and return() whatever *plot(..) returns } } clusplot <- function(x, ...) UseMethod("clusplot") ##' @title Make/Check the (n x 2) matrix needed for clusplot.default(): ##' @param x numeric matrix or dissimilarity matrix (-> clusplot.default()) ##' @param diss logical indicating if 'x' is dissimilarity matrix. In that case, ##' 'cmdscale()' is used, otherwise (typically) 'princomp()'. ##' @return a list with components ##' x1 : (n x 2) numeric matrix; ##' var.dec: a number (in [0,1]), the "variance explained" ##' labs : the point labels (possibly 1:n) ##' @author Martin Maechler mkCheckX <- function(x, diss) { if(diss) { if(anyNA(x)) stop("NA-values are not allowed in dist-like 'x'.") if(inherits(x, "dist")) { n <- attr(x, "Size") labs <- attr(x, "Labels") } else { # x (num.vector or square matrix) must be transformed into diss. siz <- sizeDiss(x) if(is.na(siz)) { if((n <- nrow(x)) != ncol(x)) stop("Distances must be result of dist or a square matrix.") if(all.equal(x, t(x)) != TRUE) stop("the square matrix is not symmetric.") labs <- dimnames(x)[[1]] } else { if(!is.vector(x)) { labs <- attr(x, "Labels") # possibly NULL x <- as.matrix(x) if((n <- nrow(x)) == ncol(x) && all.equal(x, t(x)) == TRUE) { labs <- dimnames(x)[[1]] } else { ## Hmm, when does this ever happen : ## numeric, not-dist, non-vector, not symmetric matrix ? warning(">>>>> funny case in clusplot.default() -- please report!\n") ## if(n != sizeDiss(x)) ... attr(x, "Size") <- siz <- sizeDiss(x) if(is.null(labs)) labs <- 1:siz } } else { attr(x, "Size") <- n <- siz } } } x1 <- cmdscale(x, k = 2, add = TRUE) if(x1$ac < 0) ## Rarely ! (FIXME: need and test example!) x1 <- cmdscale(x, k = 2, eig = TRUE)# TODO: not 'eig', but list. = TRUE for R >= 3.2.2 var.dec <- x1$GOF[2] # always in [0,1] x1 <- x1$points } else { ## Not (diss) if(!is.matrix(x)) stop("x is not a data matrix") if(anyNA(x)) { y <- is.na(x) if(any(apply(y, 1, all))) stop("one or more objects contain only missing values") if(any(apply(y, 2, all))) stop("one or more variables contain only missing values") x <- apply(x, 2, function(x) { x[is.na(x)] <- median(x, na.rm = TRUE); x } ) message("Missing values were displaced by the median of the corresponding variable(s)") } n <- nrow(x) labs <- dimnames(x)[[1]] x1 <- if(ncol(x) <= 1) { var.dec <- 1 matrix(c(t(x), rep(0, length(x))), ncol = 2) } else { prim.pr <- princomp(x, scores = TRUE, cor = ncol(x) > 2) sd2 <- prim.pr$sdev^2 var.dec <- cumsum(sd2/sum(sd2))[2] prim.pr$scores[, 1:2] } } list(x = x1, var.dec = var.dec, labs = if(is.null(labs)) 1:n else labs) } ## mkCheckX() ## TODO: allow components (2,3) or (1,3) instead of always (1,2) => drop 'var.dec', 'sub' clusplot.default <- function(x, clus, diss = FALSE, s.x.2d = mkCheckX(x, diss), stand = FALSE, lines = 2, shade = FALSE, color = FALSE, labels = 0, plotchar = TRUE, col.p = "dark green", # was 5 (= shaded col) col.txt = col.p, col.clus = if(color) c(2, 4, 6, 3) else 5, cex = 1, cex.txt = cex, span = TRUE, add = FALSE, xlim = NULL, ylim = NULL, main = paste("CLUSPLOT(", deparse1(substitute(x)),")"), sub = paste("These two components explain", round(100 * var.dec, digits = 2), "% of the point variability."), xlab = "Component 1", ylab = "Component 2", verbose = getOption("verbose"), ...) { force(main) if(is.data.frame(x)) x <- data.matrix(x) if(!is.numeric(x)) stop("x is not numeric") ## FIXME: - if labels == 0 or == 4, do not need "labs" ## - if !missing(sub), do not need "var.dec" stopifnot(is.list(s.x.2d), c("x","labs","var.dec") %in% names(s.x.2d), (n <- nrow(x1 <- s.x.2d[["x"]])) > 0) labels1 <- s.x.2d[["labs"]] var.dec <- s.x.2d[["var.dec"]] ## --- The 2D space is setup and points are in x1[,] (n x 2) --- clus <- as.vector(clus) if(length(clus) != n) stop("The clustering vector is of incorrect length") clus <- as.factor(clus) if(anyNA(clus)) stop("NA-values are not allowed in clustering vector") if(stand) x1 <- scale(x1) levclus <- levels(clus) nC <- length(levclus) # the number of clusters d.x <- diff(range(x1[, 1])) d.y <- diff(range(x1[, 2])) z <- A <- vector("list", nC) loc <- matrix(0, nrow = nC, ncol = 2) d2 <- verhoud <- numeric(nC) ## num1 .. num6 : all used only once -- there are more constants anyway num3 <- 90 num6 <- 70 for(i in 1:nC) { ##------------- i-th cluster -------------- x <- x1[clus == levclus[i],, drop = FALSE ] aantal <- nrow(x) # number of observations in cluster [i] cov <- var(if(aantal == 1) { if(verbose) cat("cluster",i," has only one observation ..\n") rbind(x, c(0, 0)) } else x) x.1 <- range(x[, 1]) y.1 <- range(x[, 2]) notrank2 <- qr(cov, tol = 0.001)$rank != 2 if(!span && notrank2) { d2[i] <- 1 if((abs(diff(x.1)) > d.x/70) || (abs(diff(y.1)) > d.y/50)) { loc[i, ] <- c(x.1[1] + diff(x.1)/2, y.1[1] + diff(y.1)/2) a <- sqrt((loc[i, 1] - x.1[1])^2 + (loc[i, 2] - y.1[1])^2) a <- a + 0.05 * a num2 <- 40 if(abs(diff(x.1)) > d.x/70 ) { ind1 <- which.max(x[,1]) ind2 <- which.min(x[,1]) q <- atan((x[ind1, 2] - x[ind2, 2])/ (x[ind1, 1] - x[ind2, 1])) b <- if(d.y == 0) 1 else if(abs(diff(y.1)) > d.y/50) diff(y.1)/10 ## num1 <- 10 else d.y/num2 } else { b <- if(d.x == 0) 1 else d.x/num2 q <- pi/2 } D <- diag(c(a^2, b^2)) R <- rbind(c(cos(q), -sin(q)), c(sin(q), cos(q))) A[[i]] <- (R %*% D) %*% t(R) } else { a <- d.x/num3 b <- d.y/num6 if(a == 0) a <- 1 if(b == 0) b <- 1 A[[i]] <- diag(c(a^2, b^2)) loc[i, ] <- x[1, ] } oppervlak <- pi * a * b } else if(span && notrank2) { d2[i] <- 1 if(sum(x[, 1] != x[1, 1]) != 0 || sum(x[, 2] != x[1, 2]) != 0) { loc[i, ] <- c(x.1[1] + diff(x.1)/2, y.1[1] + diff(y.1)/2) a <- sqrt((loc[i, 1] - x.1[1])^2 + (loc[i, 2] - y.1[1])^2) if(any(x[, 1] != x[1, 1])) { ind1 <- which.max(x[,1]) ind2 <- which.min(x[,1]) q <- atan((x[ind1, 2] - x[ind2, 2])/ (x[ind1, 1] - x[ind2, 1])) } else { q <- pi/2 } b <- 1e-7 D <- diag(c(a^2, b^2)) R <- rbind(c(cos(q), -sin(q)), c(sin(q), cos(q))) A[[i]] <- (R %*% D) %*% t(R) } else { a <- d.x/num3 b <- d.y/num6 if(a == 0) a <- 1 if(b == 0) b <- 1 A[[i]] <- diag(c(a^2, b^2)) loc[i, ] <- x[1, ] } oppervlak <- pi * a * b } else { ## rank2 if(!span) { loc[i, ] <- colMeans(x) d2[i] <- max(mahalanobis(x, loc[i, ], cov)) ## * (1+ 0.01)^2 --- dropped factor for back-compatibility } else { ## span and rank2 if(verbose) cat("span & rank2 : calling \"spannel\" ..\n") k <- 2L res <- .C(spannel, aantal, ndep= k, dat = cbind(1., x), sqdist = double(aantal), l1 = double((k+1) ^ 2), double(k), double(k), prob = double(aantal), double(k+1), eps = (0.01),## convergence tol. maxit = 5000L, ierr = integer(1)) if(res$ierr != 0) ## MM : exactmve not available here ! warning("Error in C routine for the spanning ellipsoid,\n rank problem??") cov <- cov.wt(x, res$prob) loc[i, ] <- cov$center ## NB: cov.wt() in R has extra wt[] scaling; revert here: cov <- cov$cov * (1 - sum(cov$wt^2)) d2[i] <- weighted.mean(res$sqdist, res$prob) if(verbose) cat("ellipse( A= (", format(cov[1,]),"*", format(cov[2,2]), "),\n\td2=", format(d2[i]), ", loc[]=", format(loc[i, ]), ")\n") } A[[i]] <- cov ## oppervlak (flam.) = area (Engl.) oppervlak <- pi * d2[i] * sqrt(cov[1, 1] * cov[2, 2] - cov[1, 2]^2) } z[[i]] <- ellipsoidPoints(A[[i]], d2[i], loc[i, ], n.half= 201) verhoud[i] <- aantal/oppervlak } ## end for( i-th cluster ) x.range <- do.call(range, lapply(z, `[`, i=TRUE, j = 1)) y.range <- do.call(range, lapply(z, `[`, i=TRUE, j = 2)) verhouding <- sum(verhoud[verhoud < 1e7]) if(verhouding == 0) verhouding <- 1 ## num4 <- 37 ; num5 <- 3 --- but '41' is another constant density <- 3 + (verhoud * 37)/verhouding density[density > 41] <- 41 if (span) { if (d.x == 0) ## diff(range(x[,1]) == 0 : x-coords all the same x.range <- x1[1, 1] + c(-1,1) if (d.y == 0) ## diff(range(x[,2]) == 0 : y-coords all the same y.range <- x1[1, 2] + c(-1,1) } if(is.null(xlim)) xlim <- x.range if(is.null(ylim)) ylim <- y.range if(length(col.p) < n) col.p <- rep(col.p, length= n) ## --- Now plotting starts --- ## "Main plot" -- if(!add) { plot(x1, xlim = xlim, ylim = ylim, xlab = xlab, ylab = ylab, main = main, type = if(plotchar) "n" else "p", # if(plotchar) add points later col = col.p, cex = cex, ...) if(!is.null(sub) && !is.na(sub) && nchar(sub) > 0) title(sub = sub, adj = 0) } if(color) { if(length(col.clus) < min(4,nC)) stop("'col.clus' should have length 4 when color is TRUE") i.verh <- order(verhoud) jInd <- if(nC > 4) pam(verhoud[i.verh], 4)$clustering else 1:nC for(i in 1:nC) { k <- i.verh[i] polygon(z[[k]], density = if(shade) density[k] else 0, col = col.clus[jInd[i]], ...) } col.clus <- col.clus[jInd][order(i.verh)] } else { for(i in 1:nC) polygon(z[[i]], density = if(shade) density[i] else 0, col = col.clus, ...) } ## points after polygon in order to write ON TOP: if(plotchar) { karakter <- 1:19 for(i in 1:nC) { iC <- clus == levclus[i] points(x1[iC, , drop = FALSE], cex = cex, pch = karakter[1+(i-1) %% 19], col = col.p[iC], ...) } } if(nC > 1 && (lines == 1 || lines == 2)) { ## Draw lines between all pairs of the nC cluster (centers) ## utilities for computing ellipse intersections: clas.snijpunt <- function(x, loc, m, n, p) { if ( !is.na(xm <- x[1,m]) && loc[n, m] <= xm && xm <= loc[p, m]) x[1, ] else if(!is.na(xm <- x[2,m]) && loc[n, m] <= xm && xm <= loc[p, m]) x[2, ] else NA } coord.snijp1 <- function(x, gemid) x[2, 2] - 2 * x[1, 2] * gemid + x[1, 1] * gemid^2 coord.snijp2 <- function(x, d2, y) ((x[1, 1] * x[2, 2] - x[1, 2]^2) * d2)/y coord.snijp3 <- function(xx, y, gemid) { sy <- sqrt(y) sy <- c(sy, -sy) cbind(xx[1] + sy, xx[2] + gemid*sy) } afstand <- matrix(0, ncol = nC, nrow = nC) for(i in 1:(nC - 1)) { for(j in (i + 1):nC) { gemid <- (loc[j, 2] - loc[i, 2])/(loc[j, 1] - loc[i, 1]) s0 <- coord.snijp1(A[[i]], gemid) b0 <- coord.snijp2(A[[i]], d2[i], s0) snijp.1 <- coord.snijp3(loc[i,], y=b0, gemid) s1 <- coord.snijp1(A[[j]], gemid) b1 <- coord.snijp2(A[[j]], d2[j], s1) snijp.2 <- coord.snijp3(loc[j,], y=b1, gemid) if(loc[i, 1] != loc[j, 1]) { if(loc[i, 1] < loc[j, 1]) { punt.1 <- clas.snijpunt(snijp.1, loc, 1, i, j) punt.2 <- clas.snijpunt(snijp.2, loc, 1, i, j) } else { punt.1 <- clas.snijpunt(snijp.1, loc, 1, j, i) punt.2 <- clas.snijpunt(snijp.2, loc, 1, j, i) } } else { if(loc[i, 2] < loc[j, 2]) { punt.1 <- clas.snijpunt(snijp.1, loc, 2, i, j) punt.2 <- clas.snijpunt(snijp.2, loc, 2, i, j) } else { punt.1 <- clas.snijpunt(snijp.1, loc, 2, j, i) punt.2 <- clas.snijpunt(snijp.2, loc, 2, j, i) } } if(is.na(punt.1[1]) || is.na(punt.2[1]) || (sqrt((punt.1[1] - loc[i, 1])^2 + (punt.1[2] - loc[i, 2])^2) + sqrt((punt.2[1] - loc[j, 1])^2 + (punt.2[2] - loc[j, 2])^2)) > sqrt((loc[j, 1] - loc[i, 1])^2 + (loc[j, 2] - loc[i, 2])^2)) { afstand[i, j] <- NA } else if(lines == 1) { afstand[i, j] <- sqrt((loc[i, 1] - loc[j, 1])^2 + (loc[i, 2] - loc[j, 2])^2) segments(loc[i, 1], loc[i, 2], loc[j, 1], loc[j, 2], col = 6, ...) } else { ## lines == 2 afstand[i, j] <- sqrt((punt.1[1] - punt.2[1])^2 + (punt.1[2] - punt.2[2])^2) segments(punt.1[1], punt.1[2], punt.2[1], punt.2[2], col = 6, ...) } } } afstand <- t(afstand) + afstand } else afstand <- NULL if(labels) { if(labels == 1) { for(i in 1:nC) { ## add cluster border points m <- nrow(z[[i]]) ni <- length(ii <- seq(1, m, by = max(1, m %/% 40))) x1 <- rbind(x1, z[[i]][ii, ]) labels1 <- c(labels1, rep(levclus[i], ni)) ## identify() only allows one color: ##col.txt <- c(col.txt, rep(col.clus[if(color) i else 1], ni)) } identify(x1, labels = labels1, col = col.txt[1]) } else { ### FIXME --- 'cex.txt' but also allow to specify 'cex' (for the points) ??? Stext <- function(xy, labs, ...) { ## FIXME: these displacements are not quite ok! xy[, 1] <- xy[, 1] + diff(x.range)/130 xy[, 2] <- xy[, 2] + diff(y.range)/50 text(xy, labels = labs, ...) } if(labels == 3 || labels == 2) Stext(x1, labels1, col = col.txt, cex = cex.txt, ...) if(labels %in% c(2,4,5)) { maxima <- t(sapply(z, `[`, i=201, j=1:2)) Stext(maxima, levclus, font = 4, col = col.clus, cex = cex, ...) } if(labels == 5) identify(x1, labels = labels1, col = col.txt[1]) } } density[density == 41] <- NA invisible(list(Distances = afstand, Shading = density)) } clusplot.partition <- function(x, main = NULL, dist = NULL, ...) { if(is.null(main) && !is.null(x$call)) main <- paste0("clusplot(",format(x$call),")") if(length(x$data) != 0 && (!anyNA(x$data) || data.class(x) == "clara")) clusplot.default(x$data, x$clustering, diss = FALSE, main = main, ...) else if(!is.null(dist)) clusplot.default(dist, x$clustering, diss = TRUE, main = main, ...) else if(!is.null(x$diss)) clusplot.default(x$diss, x$clustering, diss = TRUE, main = main, ...) else { ## try to find "x$diss" by looking at the pam() call: if(!is.null(x$call)) { xD <- try(eval(x$call[[2]], envir = parent.frame())) if(inherits(xD, "try-error") || !inherits(xD, "dist")) stop(gettextf("no diss nor data found, nor the original argument of %s", deparse1(x$call))) ## else ## warning("both 'x$diss' and 'dist' are empty; ", ## "trying to find the first argument of ", deparse1(x$call)) clusplot.default(xD, x$clustering, diss = TRUE, main = main, ...) } else stop("no diss nor data found for clusplot()'") } }