# File src/library/stats/R/lm.influence.R # Part of the R package, https://www.R-project.org # # Copyright (C) 1995-2020 The R Core Team # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A copy of the GNU General Public License is available at # https://www.R-project.org/Licenses/ ### "lm" *and* "glm" leave-one-out influence measures ## this is mainly for back-compatibility (from "lsfit" time) -- use hatvalues()! hat <- function(x, intercept = TRUE) { if(is.qr(x)) n <- nrow(x$qr) else { if(intercept) x <- cbind(1, x) n <- nrow(x) x <- qr(x) } rowSums(qr.qy(x, diag(1, nrow = n, ncol = x$rank))^2) } ## see PR#7961, https://stat.ethz.ch/pipermail/r-devel/2011-January/059642.html weighted.residuals <- function(obj, drop0 = TRUE) { w <- weights(obj) r <- residuals(obj, type="deviance") if(drop0 && !is.null(w)) { if(is.matrix(r)) r[w != 0, , drop = FALSE] # e.g. mlm fit else r[w != 0] } else r } lm.influence <- function (model, do.coef = TRUE) { wt.res <- weighted.residuals(model) e <- na.omit(wt.res) is.mlm <- is.matrix(e) # n x q matrix in the multivariate lm case if (model$rank == 0) { n <- length(wt.res) # drops 0 wt, may drop NAs sigma <- sqrt(deviance(model)/df.residual(model)) res <- list(hat = rep(0, n), coefficients = matrix(0, n, 0), sigma = rep(sigma, n)) } else { ## if we have a point with hat = 1, the corresponding e should be ## exactly zero. Protect against returning Inf by forcing this e[abs(e) < 100 * .Machine$double.eps * median(abs(e))] <- 0 mqr <- qr.lm(model) n <- as.integer(nrow(mqr$qr)) if (is.na(n)) stop("invalid model QR matrix") ## in na.exclude case, omit NAs; also drop 0-weight cases if(NROW(e) != n) stop("non-NA residual length does not match cases used in fitting") do.coef <- as.logical(do.coef) tol <- 10 * .Machine$double.eps res <- .Call(C_influence, mqr, e, tol) if (do.coef){ ok <- seq_len(mqr$rank) # need this for rank-deficient cases Q <- qr.Q(mqr)[ , ok, drop=FALSE] R <- qr.R(mqr)[ok, ok, drop=FALSE] hat <- res$hat invRQtt <- t(backsolve(R,t(Q))) k <- NCOL(Q) q <- NCOL(e) ## NB: The following relies on recycling: diag(v) %*% A == A * v ## so we need a for loop for the mlm case if (is.mlm){ cf <- array(0, c(n,k,q)) for ( j in seq_len(q) ) cf[,,j] <- invRQtt * ifelse(hat == 1, 0, e[,j]/(1-hat)) } else cf <- invRQtt * ifelse(hat == 1, 0, e/(1-hat)) res$coefficients <- cf } drop1d <- function(a) { # more cautious variant of drop(.) d <- dim(a) if(length(d) == 3L && d[[3L]] == 1L) dim(a) <- d[-3L] a } if(is.null(model$na.action)) { if(!is.mlm) { ## drop the 'q=1' array extent (from C) res$sigma <- drop(res$sigma) if(do.coef) res$coefficients <- drop1d(res$coefficients) } } else { hat <- naresid(model$na.action, res$hat) hat[is.na(hat)] <- 0 # omitted cases have 0 leverage res$hat <- hat if(do.coef) { coefficients <- naresid(model$na.action, res$coefficients) coefficients[is.na(coefficients)] <- 0 # omitted cases have 0 change res$coefficients <- if(is.mlm) coefficients else drop1d(coefficients) } sigma <- naresid(model$na.action, res$sigma) sigma[is.na(sigma)] <- sqrt(deviance(model)/df.residual(model)) res$sigma <- if(is.mlm) sigma else drop(sigma) } } res$wt.res <- naresid(model$na.action, e) res$hat[res$hat > 1 - 10*.Machine$double.eps] <- 1 # force 1 names(res$hat) <- names(res$sigma) <- names(res$wt.res) if(do.coef) { cf <- coef(model) if(is.mlm) { # coef is 3d array dnr <- dimnames(res$wt.res) dimnames(res$coefficients) <- list( dnr[[1L]], rownames(cf)[!apply(cf, 1L, anyNA)], dnr[[2L]]) } else { dimnames(res$coefficients) <- list(names(res$wt.res), names(cf)[!is.na(cf)]) } } res[c("hat", "coefficients", "sigma", "wt.res")] # ensure order, for backward compatibility and regression tests } ## The following is adapted from John Fox's "car" : influence <- function(model, ...) UseMethod("influence") influence.lm <- function(model, do.coef = TRUE, ...) lm.influence(model, do.coef = do.coef, ...) influence.glm <- function(model, do.coef = TRUE, ...) { res <- lm.influence(model, do.coef = do.coef, ...) pRes <- na.omit(residuals(model, type = "pearson"))[model$prior.weights != 0] pRes <- naresid(model$na.action, pRes) names(res)[names(res) == "wt.res"] <- "dev.res" c(res, list(pear.res = pRes)) } hatvalues <- function(model, ...) UseMethod("hatvalues") hatvalues.lm <- function(model, infl = lm.influence(model, do.coef=FALSE), ...) infl$hat rstandard <- function(model, ...) UseMethod("rstandard") rstandard.lm <- function(model, infl = lm.influence(model, do.coef=FALSE), sd = sqrt(deviance(model)/df.residual(model)), type = c("sd.1", "predictive"), ...) { type <- match.arg(type) res <- infl$wt.res / switch(type, "sd.1" = c(outer(sqrt(1 - infl$hat), sd)), "predictive" = 1 - infl$hat) res[is.infinite(res)] <- NaN res } ### New version from Brett Presnell, March 2011 ### Slightly modified (dispersion bit) by pd rstandard.glm <- function(model, infl=influence(model, do.coef=FALSE), type=c("deviance","pearson"), ...) { type <- match.arg(type) res <- switch(type, pearson = infl$pear.res, infl$dev.res) res <- res/sqrt(summary(model)$dispersion * (1 - infl$hat)) res[is.infinite(res)] <- NaN res } rstudent <- function(model, ...) UseMethod("rstudent") rstudent.lm <- function(model, infl = lm.influence(model, do.coef=FALSE), res = infl$wt.res, ...) { res <- res / (infl$sigma * sqrt(1 - infl$hat)) res[is.infinite(res)] <- NaN res } rstudent.glm <- function(model, infl = influence(model, do.coef=FALSE), ...) { r <- infl$dev.res r <- sign(r) * sqrt(r^2 + (infl$hat * infl$pear.res^2)/(1 - infl$hat)) r[is.infinite(r)] <- NaN if (any(family(model)$family == c("binomial", "poisson"))) r else r/infl$sigma } ### FIXME for glm (see above) ?!? dffits <- function(model, infl = lm.influence(model, do.coef=FALSE), res = weighted.residuals(model)) { res <- res * sqrt(infl$hat)/(infl$sigma*(1-infl$hat)) res[is.infinite(res)] <- NaN res } dfbeta <- function(model, ...) UseMethod("dfbeta") dfbeta.lm <- function(model, infl = lm.influence(model, do.coef=TRUE), ...) { ## for lm & glm b <- infl$coefficients mlm <- is.matrix(wr <- infl$wt.res) ## is this needed -- check!? if(!mlm) dimnames(b) <- list(names(wr), variable.names(model)) b } dfbetas <- function(model, ...) UseMethod("dfbetas") dfbetas.lm <- function (model, infl = lm.influence(model, do.coef=TRUE), ...) { ## for lm & glm qrm <- qr(model) xxi <- chol2inv(qrm$qr, qrm$rank) db <- dfbeta(model, infl) if(length(dim(db)) == 3L) db <- aperm(db, c(1L,3:2)) db / outer(infl$sigma, sqrt(diag(xxi))) } covratio <- function(model, infl = lm.influence(model, do.coef=FALSE), res = weighted.residuals(model)) { n <- nrow(qr.lm(model)$qr) p <- model$rank omh <- 1-infl$hat e.star <- res/(infl$sigma*sqrt(omh)) e.star[is.infinite(e.star)] <- NaN 1/(omh*(((n - p - 1)+e.star^2)/(n - p))^p) } cooks.distance <- function(model, ...) UseMethod("cooks.distance") ## Used in plot.lm(); allow passing of known parts; `infl' used only via `hat' cooks.distance.lm <- function(model, infl = lm.influence(model, do.coef=FALSE), res = weighted.residuals(model), sd = sqrt(deviance(model)/df.residual(model)), hat = infl$hat, ...) { p <- model$rank res <- ((res/c(outer(1 - hat, sd)))^2 * hat)/p res[is.infinite(res)] <- NaN res } cooks.distance.glm <- function(model, infl = influence(model, do.coef=FALSE), res = infl$pear.res, dispersion = summary(model)$dispersion, hat = infl$hat, ...) { p <- model$rank res <- (res/(1-hat))^2 * hat/(dispersion* p) res[is.infinite(res)] <- NaN res } influence.measures <- function(model, infl = influence(model)) { is.influential <- function(infmat, n)# n == sum(h > 0) [!] { ## Argument is result of using influence.measures d <- dim(infmat) k <- d[[length(d)]] - 4L if(n <= k) stop("too few cases i with h_ii > 0), n < k") absmat <- abs(infmat) r <- if(is.matrix(infmat)) { # usual non-mlm case ## a matrix of logicals structured like the argument cbind(absmat[, 1L:k] > 1, # |dfbetas| > 1 absmat[, k + 1] > 3 * sqrt(k/(n - k)), # |dffit| > .. abs(1 - infmat[, k + 2]) > (3*k)/(n - k),# |1-cov.r| >.. pf(infmat[, k + 3], k, n - k) > 0.5,# "P[cook.d..]" > .5 infmat[, k + 4] > (3 * k)/n) # hat > 3k/n } else { # is.mlm: a 3d-array, not a matrix ## a 3d-array of logicals structured like the argument c(absmat[,, 1L:k] > 1, # |dfbetas| > 1 absmat[,, k + 1] > 3 * sqrt(k/(n - k)), # |dffit| > .. abs(1 - infmat[,, k + 2]) > (3*k)/(n - k),# |1-cov.r| >.. pf(infmat[,, k + 3], k, n - k) > 0.5,# "P[cook.d..]" > .5 infmat[,, k + 4] > (3 * k)/n) # hat > 3k/n } attributes(r) <- attributes(infmat) # dim, dimnames, .. r } p <- model$rank e <- weighted.residuals(model) s <- sqrt(sum(e^2, na.rm=TRUE)/df.residual(model)) mqr <- qr.lm(model) xxi <- chol2inv(mqr$qr, mqr$rank) si <- infl$sigma h <- infl$hat is.mlm <- is.matrix(e) cf <- if(is.mlm) aperm(infl$coefficients, c(1L,3:2)) else infl$coefficients dfbetas <- cf / outer(infl$sigma, sqrt(diag(xxi))) vn <- variable.names(model); vn[vn == "(Intercept)"] <- "1_" dimnames(dfbetas)[[length(dim(dfbetas))]] <- paste0("dfb.", abbreviate(vn)) ## Compatible to dffits(): dffits <- e*sqrt(h)/(si*(1-h)) if(any(ii <- is.infinite(dffits))) dffits[ii] <- NaN cov.ratio <- (si/s)^(2 * p)/(1 - h) cooks.d <- if(inherits(model, "glm")) (infl$pear.res/(1-h))^2 * h/(summary(model)$dispersion * p) else # lm (incl "mlm") ((e/(s * (1 - h)))^2 * h)/p infmat <- if(is.mlm) { # a 3d-array, not a matrix dns <- dimnames(dfbetas) dns[[3]] <- c(dns[[3]], "dffit", "cov.r", "cook.d", "hat") a <- array(dfbetas, dim = dim(dfbetas) + c(0,0, 3+1), dimnames = dns) a[,, "dffit"] <- dffits a[,, "cov.r"] <- cov.ratio a[,,"cook.d"] <- cooks.d a[,, "hat" ] <- h a } else { cbind(dfbetas, dffit = dffits, cov.r = cov.ratio, cook.d = cooks.d, hat = h) } infmat[is.infinite(infmat)] <- NaN is.inf <- is.influential(infmat, sum(h > 0)) ans <- list(infmat = infmat, is.inf = is.inf, call = model$call) class(ans) <- "infl" ans } print.infl <- function(x, digits = max(3L, getOption("digits") - 4L), ...) { ## `x' : as the result of influence.measures(.) cat("Influence measures of\n\t", deparse(x$call),":\n\n") is.star <- apply(x$is.inf, 1L, any, na.rm = TRUE) print(data.frame(x$infmat, inf = ifelse(is.star, "*", " ")), digits = digits, ...) invisible(x) } summary.infl <- function(object, digits = max(2L, getOption("digits") - 5L), ...) { ## object must be as the result of influence.measures(.) is.inf <- object$is.inf isMLM <- length(dim(is.inf)) == 3L ## will have NaN values from any hat=1 rows. is.inf[is.na(is.inf)] <- FALSE is.star <- apply(is.inf, 1L, any) cat("Potentially influential observations of\n\t", deparse(object$call),":\n") if(any(is.star)) { if(isMLM) { is.inf <- is.inf [is.star,,] imat <- object $ infmat[is.star,, , drop = FALSE] } else { # regular "lm" is.inf <- is.inf [is.star, ] imat <- object $ infmat[is.star, , drop = FALSE] } rownam <- dimnames(object $ infmat)[[1L]] %||% format(seq(is.star)) dimnames(imat)[[1L]] <- rownam[is.star] chmat <- format(round(imat, digits = digits)) cat("\n") print(array(paste0(chmat, c("", "_*")[1L + is.inf]), dimnames = dimnames(imat), dim = dim(imat)), quote = FALSE) invisible(imat) } else { cat("NONE\n") numeric() } }