# File src/library/stats/R/loess.R # Part of the R package, https://www.R-project.org # # Copyright (C) 1998-2020 The R Core Team # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A copy of the GNU General Public License is available at # https://www.R-project.org/Licenses/ loess <- function(formula, data, weights, subset, na.action, model = FALSE, span = 0.75, enp.target, degree = 2L, parametric = FALSE, drop.square = FALSE, normalize = TRUE, family = c("gaussian", "symmetric"), method = c("loess", "model.frame"), control = loess.control(...), ...) { family <- match.arg(family) method <- match.arg(method) mf <- match.call(expand.dots=FALSE) mf$model <- mf$span <- mf$enp.target <- mf$degree <- mf$parametric <- mf$drop.square <- mf$normalize <- mf$family <- mf$method <- mf$control <- mf$... <- NULL ## need stats:: for non-standard evaluation mf[[1L]] <- quote(stats::model.frame) mf <- eval(mf, parent.frame()) if (match.arg(method) == "model.frame") return(mf) mt <- attr(mf, "terms") y <- model.response(mf, "numeric") w <- model.weights(mf) %||% rep_len(1, length(y)) nmx <- as.character(attr(mt, "variables"))[-(1L:2)] x <- mf[, nmx, drop=FALSE] if(any(sapply(x, is.factor))) stop("predictors must all be numeric") x <- as.matrix(x) D <- ncol(x) nmx <- setNames(nm = colnames(x)) drop.square <- match(nmx, nmx[drop.square], 0L) > 0L parametric <- match(nmx, nmx[parametric], 0L) > 0L if(!match(degree, 0L:2L, 0L)) stop("'degree' must be 0, 1 or 2") iterations <- if(family == "gaussian") 1L else control$iterations if(!missing(enp.target)) if(!missing(span)) warning("both 'span' and 'enp.target' specified: 'span' will be used") else { # White book p.321 tau <- switch(degree+1L, 1, D+1, (D+1)*(D+2)/2) - sum(drop.square) span <- 1.2 * tau/enp.target } ## Let's add sanity checks on control if(!is.list(control) || !is.character(control$surface) || !is.character(control$statistics) || !is.character(control$trace.hat) || !is.numeric(control$cell) || !is.numeric(iterations)) stop("invalid 'control' argument") fit <- simpleLoess(y, x, w, span, degree=degree, parametric=parametric, drop.square=drop.square, normalize=normalize, statistics=control$statistics, surface=control$surface, cell=control$cell, iterations=iterations, iterTrace=control$iterTrace, trace.hat=control$trace.hat) fit$call <- match.call() fit$terms <- mt fit$xnames <- nmx fit$x <- x fit$y <- y fit$weights <- w if(model) fit$model <- mf fit$na.action <- attr(mf, "na.action") fit } loess.control <- function(surface = c("interpolate", "direct"), statistics = c("approximate", "exact", "none"), trace.hat = c("exact", "approximate"), cell = 0.2, iterations = 4L, iterTrace = FALSE, ...) { stopifnot(length(iterations) == 1L, !is.na(iterations), as.integer(iterations) > 0L, length(iterTrace) == 1L, !is.na(iterTrace), as.integer(iterTrace) >= 0L) list(surface = match.arg(surface), statistics = match.arg(statistics), trace.hat = match.arg(trace.hat), cell=cell, iterations=iterations, iterTrace=iterTrace) } simpleLoess <- function(y, x, weights, span = 0.75, degree = 2L, parametric = FALSE, drop.square = FALSE, normalize = TRUE, statistics = "approximate", surface = "interpolate", cell, iterations, ## iter. == 1 <==> "gaussian" ## tol = 1e-4, ## <- TODO stop iteration if converged := {rel.change <= tol} iterTrace, trace.hat) { ## loess_ translated to R. D <- as.integer(NCOL(x)) if (is.na(D)) stop("invalid NCOL(X)") if(D > 4) stop("only 1-4 predictors are allowed") N <- as.integer(NROW(x)) if (is.na(N)) stop("invalid NROW(X)") if(!N || !D) stop("invalid 'x'") if(length(y) != N) stop("invalid 'y'") x <- as.matrix(x) storage.mode(x) <- "double" storage.mode(y) <- "double" storage.mode(weights) <- "double" max.kd <- max(N, 200L) robust <- rep_len(1, N) if(normalize && D > 1L) { trim <- ceiling(0.1 * N) divisor <- sqrt(apply(apply(x, 2L, sort)[seq(trim+1, N-trim), , drop = FALSE], 2L, var)) x <- x/rep(divisor, rep_len(N, D)) } else divisor <- 1 sum.drop.sqr <- sum(drop.square) sum.parametric <- sum(parametric) nonparametric <- sum(!parametric) order.parametric <- order(parametric) x <- x[, order.parametric] order.drop.sqr <- (2L - drop.square)[order.parametric] if(degree == 1L && sum.drop.sqr) stop("specified the square of a factor predictor to be dropped when degree = 1") if(D == 1L && sum.drop.sqr) stop("specified the square of a predictor to be dropped with only one numeric predictor") if(sum.parametric == D) stop("specified parametric for all predictors") if (length(span) != 1L) stop("invalid argument 'span'") if (length(cell) != 1L) stop("invalid argument 'cell'") if (length(degree) != 1L) stop("invalid argument 'degree'") if(surface == "interpolate" && statistics == "approximate") # default statistics <- if(trace.hat == "exact") "1.approx" else "2.approx" # trace.hat == "approximate" surf.stat <- paste(surface, statistics, sep = "/") do.rob <- (iterations > 1L) # will do robustness iter. if(!do.rob && iterTrace) { warning(sprintf(gettext("iterTrace = %d is not obeyed since iterations = %d"), iterTrace, iterations)) iterTrace <- FALSE } no.st <- (statistics == "none") if(iterTrace) wRSS <- NA for(j in seq_len(iterations)) { no.st <- (statistics == "none") z <- .C(C_loess_raw, # ../src/loessc.c y, x, if(no.st) 1 else weights, if(no.st) weights * robust else 1, D, N, as.double(span), as.integer(degree), as.integer(nonparametric), as.integer(order.drop.sqr), as.integer(sum.drop.sqr), as.double(span*cell), as.character(surf.stat), fitted.values = double(N), parameter = integer(7L), a = integer(max.kd), xi = double(max.kd), vert = double(2L*D), vval = double((D+1L)*max.kd), diagonal = double(N), trL = double(1L), delta1 = double(1L), delta2 = double(1L), as.integer(surf.stat == "interpolate/exact")) fitted <- z$fitted.values fitted.residuals <- y - fitted if(j < iterations) { ## update robustness weights, ## not for *last* iteration, so they remain consistent with 'fitted.values' if(iterTrace) old.rob <- robust robust <- .Fortran(C_lowesw, fitted.residuals, N, robust = double(N), integer(N))$robust } if(j == 1) { trace.hat.out <- z$trL one.delta <- z$delta1 two.delta <- z$delta2 if(do.rob) { statistics <- "none" surf.stat <- paste(surface, statistics, sep = "/") no.st <- TRUE } } if(iterTrace) { oSS <- wRSS wRSS <- sum(weights * fitted.residuals^2) del.SS <- abs(oSS-wRSS)/(if(wRSS == 0) 1 else wRSS) d.rob.w <- if(j < iterations) ## have updated 'robust', see above sum(abs(old.rob - robust)) / sum(robust) else NA cat(sprintf( "iter.%2d: wRSS=%#14.9g, rel. changes: (SS=%#9.4g, rob.wgts=%#9.4g)\n", j, wRSS, del.SS, d.rob.w)) if(iterTrace >= 2 && j < iterations) { cat("robustness weights:\n") print(quantile(robust, probs=(0:8)/8), digits=3) } } } ## end { iterations } if(surface == "interpolate") { pars <- setNames(z$parameter, c("d", "n", "vc", "nc", "nv", "liv", "lv")) enough <- (D + 1L) * pars[["nv"]] fit.kd <- list(parameter=pars, a=z$a[1L:pars[4L]], xi=z$xi[1L:pars[4L]], vert=z$vert, vval=z$vval[1L:enough]) } if(do.rob) { pseudovalues <- .Fortran(C_lowesp, # lowesp() in ../src/loessf.f N, as.double(y), fitted, as.double(weights), # 'pwgts' as.double(robust), # 'rwgts' integer(N), pseudovalues = double(N))$pseudovalues zz <- .C(C_loess_raw, pseudovalues, x, weights, weights, D, N, as.double(span), as.integer(degree), as.integer(nonparametric), as.integer(order.drop.sqr), as.integer(sum.drop.sqr), as.double(span*cell), as.character(surf.stat), ## == /none fitted = double(N), parameter = integer(7L), a = integer(max.kd), xi = double(max.kd), vert = double(2L*D), vval = double((D+1L)*max.kd), diagonal = double(N), trL = double(1L), delta1 = double(1L), delta2 = double(1L), 0L)[["fitted"]] pseudo.resid <- pseudovalues - zz } sum.squares <- if(do.rob) sum (weights * pseudo.resid^2) else sum(weights * fitted.residuals^2) enp <- one.delta + 2*trace.hat.out - N s <- sqrt(sum.squares/one.delta) ## return structure( class = "loess", list(n = N, fitted = fitted, residuals = fitted.residuals, enp = enp, s = s, one.delta = one.delta, two.delta = two.delta, trace.hat = trace.hat.out, divisor = divisor, robust = robust, pars = list(span = span, degree = degree, normalize = normalize, parametric = parametric, drop.square = drop.square, surface = surface, cell = cell, family = if(iterations <= 1L) "gaussian" else "symmetric", trace.hat = trace.hat, iterations = iterations), kd = if(surface == "interpolate") fit.kd)) } predict.loess <- function(object, newdata = NULL, se = FALSE, na.action = na.pass, ...) { if(!inherits(object, "loess")) stop("first argument must be a \"loess\" object") if(is.null(newdata) && !se) return(fitted(object)) op <- object$pars res <- predLoess(object$y, object$x, newx = if(is.null(newdata)) object$x else if(is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)), newdata, na.action = na.action)) else as.matrix(newdata), # this case is undocumented object$ s, object$ weights, object$ robust, op$span, op$degree, op$normalize, op$parametric, op$drop.square, op$surface, op$cell, op$family, object$ kd, object$ divisor, se = se) if(!is.null(out.attrs <- attr(newdata, "out.attrs"))) { # expand.grid used if(se) { res$fit <- array(res$fit, out.attrs$dim, out.attrs$dimnames) res$se.fit <- array(res$se.fit, out.attrs$dim, out.attrs$dimnames) } else res <- array(res, out.attrs$dim, out.attrs$dimnames) } if(se) res$df <- object$one.delta^2/object$two.delta res } predLoess <- function(y, x, newx, s, weights, robust, span, degree, normalize, parametric, drop.square, surface, cell, family, kd, divisor, se = FALSE) { ## translation of pred_ D <- NCOL(x); N <- NROW(x); M <- NROW(newx) x <- as.matrix(x); newx <- as.matrix(newx) if(any(divisor != 1)) { newx <- newx/rep(divisor, rep_len(M, D)) x <- x /rep(divisor, rep_len(N, D)) } sum.drop.sqr <- sum(drop.square) nonparametric <- sum(!parametric) order.parametric <- order(parametric) x <- x[, order.parametric, drop=FALSE] x.evaluate <- newx[, order.parametric, drop=FALSE] order.drop.sqr <- (2L - drop.square)[order.parametric] storage.mode(x) <- "double" storage.mode(y) <- "double" if(surface == "direct") { nas <- rowSums(is.na(newx)) > 0 fit <- rep_len(NA_real_, length(nas)) x.evaluate <- x.evaluate[!nas,, drop = FALSE] M <- nrow(x.evaluate) if(se) { se.fit <- fit z <- .C(C_loess_dfitse, y, x, as.double(x.evaluate), as.double(weights*robust), as.double(robust), as.integer(family =="gaussian"), as.double(span), as.integer(degree), as.integer(nonparametric), as.integer(order.drop.sqr), as.integer(sum.drop.sqr), as.integer(D), as.integer(N), as.integer(M), fit = double(M), L = double(N*M))[c("fit", "L")] fit[!nas] <- z$fit ses <- rowSums(matrix(z$L^2, M, N) / rep(weights, rep_len(M,N))) se.fit[!nas] <- s * sqrt(ses) } else { fit[!nas] <- .C(C_loess_dfit, y, x, as.double(x.evaluate), as.double(weights*robust), as.double(span), as.integer(degree), as.integer(nonparametric), as.integer(order.drop.sqr), as.integer(sum.drop.sqr), as.integer(D), as.integer(N), as.integer(M), fit = double(M))$fit } } else { ## interpolate ## need to eliminate points outside original range - not in pred_ ranges <- apply(x, 2L, range) inside <- rowSums((x.evaluate <= rep(ranges[2L,], rep_len(M, D))) & (x.evaluate >= rep(ranges[1L,], rep_len(M, D)))) == D inside[is.na(inside)] <- FALSE M1 <- sum(inside) fit <- rep_len(NA_real_, M) if(any(inside)) fit[inside] <- .C(C_loess_ifit, as.integer(kd$parameter), as.integer(kd$a), as.double(kd$xi), as.double(kd$vert), as.double(kd$vval), as.integer(M1), as.double(x.evaluate[inside, ]), fit = double(M1))$fit if(se) { se.fit <- rep_len(NA_real_, M) if(any(inside)) { L <- .C(C_loess_ise, y, x, as.double(x.evaluate[inside, ]), as.double(weights), as.double(span), as.integer(degree), as.integer(nonparametric), as.integer(order.drop.sqr), as.integer(sum.drop.sqr), as.double(span*cell), as.integer(D), as.integer(N), as.integer(M1), double(M1), L = double(N*M1) )$L tmp <- rowSums(matrix(L^2, M1, N) / rep(weights, rep_len(M1,N))) se.fit[inside] <- s * sqrt(tmp) } } } rn <- rownames(newx) if(se) { if(!is.null(rn)) names(fit) <- names(se.fit) <- rn list(fit = fit, se.fit = drop(se.fit), residual.scale = s) } else { if(!is.null(rn)) names(fit) <- rn fit } } pointwise <- function(results, coverage) { fit <- results$fit lim <- qt((1 - coverage)/2, results$df, lower.tail = FALSE) * results$se.fit list(fit = fit, lower = fit - lim, upper = fit + lim) } print.loess <- function(x, digits = max(3L, getOption("digits") - 3L), ...) { if(!is.null(cl <- x$call)) { cat("Call:\n") dput(cl, control=NULL) } cat("\nNumber of Observations:", x$n, "\n") cat("Equivalent Number of Parameters:", format(round(x$enp, 2L)), "\n") cat("Residual", if(x$pars$family == "gaussian")"Standard Error:" else "Scale Estimate:", format(signif(x$s, digits)), "\n") invisible(x) } summary.loess <- function(object, ...) { class(object) <- "summary.loess" object } print.summary.loess <- function(x, digits = max(3L, getOption("digits") - 3L), ...) { print.loess(x, digits=digits, ...) cat("Trace of smoother matrix: ", format(round(x$trace.hat, 2L)), " (",x$pars$trace.hat, ")\n", sep="") cat("\nControl settings:\n") cat(" span : ", format(x$pars$span), "\n") cat(" degree : ", x$pars$degree, "\n") cat(" family : ", x$pars$family) if(x$pars$family != "gaussian") cat(" iterations =", x$pars$iterations) cat("\n surface : ", x$pars$surface) if(x$pars$surface == "interpolate") cat(" cell =", format(x$pars$cell)) cat("\n normalize: ", x$pars$normalize) cat("\n parametric: ", x$pars$parametric) cat("\ndrop.square: ", x$pars$drop.square, "\n") invisible(x) } scatter.smooth <- function(x, y = NULL, span = 2/3, degree = 1, family = c("symmetric", "gaussian"), xlab = NULL, ylab = NULL, ylim = range(y, pred$y, na.rm = TRUE), evaluation = 50, ..., lpars = list()) { xlabel <- if (!missing(x)) deparse1(substitute(x)) ylabel <- if (!missing(y)) deparse1(substitute(y)) xy <- xy.coords(x, y, xlabel, ylabel) x <- xy$x y <- xy$y xlab <- if (is.null(xlab)) xy$xlab else xlab ylab <- if (is.null(ylab)) xy$ylab else ylab pred <- loess.smooth(x, y, span, degree, family, evaluation) plot(x, y, ylim = ylim, xlab = xlab, ylab = ylab, ...) do.call(lines, c(list(pred), lpars)) invisible() } loess.smooth <- function(x, y, span = 2/3, degree = 1, family = c("symmetric", "gaussian"), evaluation = 50, ...) { notna <- !(is.na(x) | is.na(y)) x <- x[notna]; y <- y[notna] new.x <- seq.int(min(x), max(x), length.out = evaluation) control <- loess.control(...) w <- rep_len(1, length(y)) family <- match.arg(family) iterations <- if(family == "gaussian") 1L else control$iterations kd <- simpleLoess(y, x, w, span, degree=degree, parametric=FALSE, drop.square=FALSE, normalize=FALSE, statistics="none", surface="interpolate", cell=control$cell, iterations=iterations, iterTrace=control$iterTrace, trace.hat=control$trace.hat)$kd z <- .C(C_loess_ifit, as.integer(kd$parameter), as.integer(kd$a), as.double(kd$xi), as.double(kd$vert), as.double(kd$vval), as.integer(evaluation), as.double(new.x), fit = double(evaluation))$fit list(x = new.x, y = z) } anova.loess <- function(object, ...) { objects <- list(object, ...) responses <- as.character(lapply(objects, function(x) as.character(x$terms[[2L]]))) sameresp <- responses == responses[1L] ## calculate the number of models if (!all(sameresp)) { objects <- objects[sameresp] warning(gettextf("models with response %s removed because response differs from model 1", sQuote(deparse(responses[!sameresp]))), domain = NA) } nmodels <- length(objects) if(nmodels <= 1L) stop("no models to compare") models <- as.character(lapply(objects, function(x) x$call)) descr <- paste0("Model ", format(1L:nmodels), ": ", models, collapse = "\n") ## extract statistics delta1 <- sapply(objects, function(x) x$one.delta) delta2 <- sapply(objects, function(x) x$two.delta) s <- sapply(objects, function(x) x$s) enp <- sapply(objects, function(x) x$enp) rss <- s^2*delta1 max.enp <- order(enp)[nmodels] d1diff <- abs(diff(delta1)) dfnum <- c(d1diff^2/abs(diff(delta2))) dfden <- (delta1^2/delta2)[max.enp] Fvalue <- c(NA, (abs(diff(rss))/d1diff)/s[max.enp]^2) pr <- pf(Fvalue, dfnum, dfden, lower.tail = FALSE) ans <- data.frame(ENP = round(enp,2L), RSS = rss, "F-value" = Fvalue, "Pr(>F)" = pr, check.names = FALSE) attr(ans, "heading") <- paste0(descr, "\n\n", "Analysis of Variance: denominator df ", format(round(dfden, 2L)), "\n") class(ans) <- c("anova", "data.frame") ans }