# (c) Simon N. Wood (2013-2022). Provided under GPL 2.
## Routines for gam estimation beyond exponential family.

dDeta <- function(y,mu,wt,theta,fam,deriv=0) {
## What is available directly from the family are derivatives of the
## deviance and link w.r.t. mu. This routine converts these to the
## required derivatives of the deviance w.r.t. eta.
## deriv is the order of derivative of the smoothing parameter score
## required.
## This version is based on ratios of derivatives of links rather
## than raw derivatives of links. g2g = g''/g'^2, g3g = g'''/g'^3 etc
## Returns list d with Deta, Deta2, EDeta2 (expected), mixed eta/theta
## derivatives, the ratio terms Deta.Deta2 and Deta.EDeta2 (used for
## stable pseudodata construction), and d$good flagging finite rows.
  r <- fam$Dd(y, mu, theta, wt, level=deriv) ## family-supplied derivs w.r.t. mu
  d <- list(Deta=0,Dth=0,Dth2=0,Deta2=0,EDeta2=0,Detath=0,
            Deta3=0,Deta2th=0,Detath2=0,
            Deta4=0,Deta3th=0,Deta2th2=0)
  if (fam$link=="identity") { ## don't waste time on transformation
    d$Deta <- r$Dmu;d$Deta2 <- r$Dmu2
    d$EDeta2 <- r$EDmu2;d$Deta.Deta2 <- r$Dmu/r$Dmu2
    d$Deta.EDeta2 <- r$Dmu/r$EDmu2
    if (deriv>0) {
      d$Dth <- r$Dth; d$Detath <- r$Dmuth
      d$Deta3 <- r$Dmu3; d$Deta2th <- r$Dmu2th
      d$EDeta2th <- r$EDmu2th;d$EDeta3 <- r$EDmu3
    }
    if (deriv>1) {
      d$Deta4 <- r$Dmu4; d$Dth2 <- r$Dth2; d$Detath2 <- r$Dmuth2
      d$Deta2th2 <- r$Dmu2th2; d$Deta3th <- r$Dmu3th
    }
  } else { ## general link: chain rule in terms of link derivative ratios
    ig1 <- fam$mu.eta(fam$linkfun(mu)) ## dmu/deta
    ig12 <- ig1^2
    g2g <- fam$g2g(mu)
    d$Deta <- r$Dmu * ig1
    d$Deta2 <- r$Dmu2*ig12 - r$Dmu*g2g*ig1
    d$EDeta2 <- r$EDmu2*ig12
    d$Deta.Deta2 <- r$Dmu/(r$Dmu2*ig1 - r$Dmu*g2g)
    d$Deta.EDeta2 <- r$Dmu/(r$EDmu2*ig1)
    if (deriv>0) {
      ig13 <- ig12 * ig1
      d$Dth <- r$Dth
      d$Detath <- r$Dmuth * ig1
      g3g <- fam$g3g(mu)
      d$Deta3 <- r$Dmu3*ig13 - 3*r$Dmu2 * g2g * ig12 + r$Dmu * (3*g2g^2 - g3g)*ig1
      if (!is.null(r$EDmu3)) d$EDeta3 <- r$EDmu3*ig13 - 3*r$EDmu2 * g2g * ig12 ## EDmu=0
      d$Deta2th <- r$Dmu2th*ig12 - r$Dmuth*g2g*ig1
      if (!is.null(r$EDmu2th)) d$EDeta2th <- r$EDmu2th*ig12 ##- r$EDmuth*g2g*ig1
    }
    if (deriv>1) {
      g4g <- fam$g4g(mu)
      d$Deta4 <- ig12^2*r$Dmu4 - 6*r$Dmu3*ig13*g2g + r$Dmu2*(15*g2g^2-4*g3g)*ig12 -
                 r$Dmu*(15*g2g^3-10*g2g*g3g +g4g)*ig1
      d$Dth2 <- r$Dth2
      d$Detath2 <- r$Dmuth2 * ig1
      d$Deta2th2 <- ig12*r$Dmu2th2 - r$Dmuth2*g2g*ig1
      d$Deta3th <- ig13*r$Dmu3th - 3 *r$Dmu2th*g2g*ig12 + r$Dmuth*(3*g2g^2-g3g)*ig1
    }
  } ## end of non identity
  ## flag rows where all required derivatives are finite...
  good <- is.finite(d$Deta)&is.finite(d$Deta2)
  if (deriv>0) {
    nth <- length(theta)
    if (nth>1) good <- good&is.finite(rowSums(d$Dth))&is.finite(rowSums(d$Detath))&
                       is.finite(rowSums(d$Deta2th))&is.finite(d$Deta3) else
               good <- good & is.finite(d$Deta3) & if (nth==0) TRUE else
                       is.finite(d$Dth)&is.finite(d$Detath)&is.finite(d$Deta2th)
    if (deriv>1) {
      if (nth>1) good <- good&is.finite(rowSums(d$Dth2))&is.finite(rowSums(d$Detath2))&is.finite(rowSums(d$Deta2th2))&
                         is.finite(rowSums(d$Deta3th))&is.finite(d$Deta4) else
                 good <- good & is.finite(d$Deta4) & if (nth==0) TRUE else
                         is.finite(d$Dth2)&is.finite(d$Detath2)&is.finite(d$Deta2th2)&is.finite(d$Deta3th)
    }
  }
  d$good <- good
  d
} ## dDeta

fetad.test <- function(y,mu,wt,theta,fam,eps = 1e-7,plot=TRUE) {
## test family derivatives w.r.t. eta by finite differencing,
## printing range of differences and correlation for each derivative.
## NOTE(review): the 'plot' argument is accepted but plotting below is
## unconditional — confirm whether it should gate the plot() calls.
  dd <- dDeta(y,mu,wt,theta,fam,deriv=2)
  dev <- fam$dev.resids(y, mu, wt,theta)
  mu1 <- fam$linkinv(fam$linkfun(mu)+eps) ## mu perturbed by eps on eta scale
  dev1 <- fam$dev.resids(y,mu1, wt,theta)
  Deta.fd <- (dev1-dev)/eps
  cat("Deta: rdiff = ",range(dd$Deta-Deta.fd)," cor = ",cor(dd$Deta,Deta.fd),"\n")
  plot(dd$Deta,Deta.fd);abline(0,1)
  nt <- length(theta)
  if (fam$n.theta && nt>0) for (i in 1:nt) { ## first derivs w.r.t. each theta
    th1 <- theta;th1[i] <- th1[i] + eps
    dev1 <- fam$dev.resids(y, mu, wt,th1)
    Dth.fd <- (dev1-dev)/eps
    um <- if (nt>1) dd$Dth[,i] else dd$Dth
    cat("Dth[",i,"]: rdiff = ",range(um-Dth.fd)," cor = ",cor(um,Dth.fd),"\n")
    plot(um,Dth.fd);abline(0,1)
  }
  ## second order up...
  dd1 <- dDeta(y,mu1,wt,theta,fam,deriv=2)
  Deta2.fd <- (dd1$Deta - dd$Deta)/eps
  cat("Deta2: rdiff = ",range(dd$Deta2-Deta2.fd)," cor = ",cor(dd$Deta2,Deta2.fd),"\n")
  plot(dd$Deta2,Deta2.fd);abline(0,1)
  Deta3.fd <- (dd1$Deta2 - dd$Deta2)/eps
  cat("Deta3: rdiff = ",range(dd$Deta3-Deta3.fd)," cor = ",cor(dd$Deta3,Deta3.fd),"\n")
  plot(dd$Deta3,Deta3.fd);abline(0,1)
  Deta4.fd <- (dd1$Deta3 - dd$Deta3)/eps
  cat("Deta4: rdiff = ",range(dd$Deta4-Deta4.fd)," cor = ",cor(dd$Deta4,Deta4.fd),"\n")
  plot(dd$Deta4,Deta4.fd);abline(0,1)
  ## and now the higher derivs wrt theta...
  ind <- 1:nt
  if (fam$n.theta && nt>0) for (i in 1:nt) {
    th1 <- theta;th1[i] <- th1[i] + eps
    dd1 <- dDeta(y,mu,wt,th1,fam,deriv=2)
    Detath.fd <- (dd1$Deta - dd$Deta)/eps
    um <- if (nt>1) dd$Detath[,i] else dd$Detath
    cat("Detath[",i,"]: rdiff = ",range(um-Detath.fd)," cor = ",cor(um,Detath.fd),"\n")
    plot(um,Detath.fd);abline(0,1)
    Deta2th.fd <- (dd1$Deta2 - dd$Deta2)/eps
    um <- if (nt>1) dd$Deta2th[,i] else dd$Deta2th
    cat("Deta2th[",i,"]: rdiff = ",range(um-Deta2th.fd)," cor = ",cor(um,Deta2th.fd),"\n")
    plot(um,Deta2th.fd);abline(0,1)
    Deta3th.fd <- (dd1$Deta3 - dd$Deta3)/eps
    um <- if (nt>1) dd$Deta3th[,i] else dd$Deta3th
    cat("Deta3th[",i,"]: rdiff = ",range(um-Deta3th.fd)," cor = ",cor(um,Deta3th.fd),"\n")
    plot(um,Deta3th.fd);abline(0,1)
    ## now the 3 second derivative w.r.t. theta terms
    ## (ind indexes the packed upper-triangle columns for row i)
    Dth2.fd <- (dd1$Dth - dd$Dth)/eps
    um <- if (nt>1) dd$Dth2[,ind] else dd$Dth2
    er <- if (nt>1) Dth2.fd[,i:nt] else Dth2.fd
    cat("Dth2[",i,",]: rdiff = ",range(um-er)," cor = ",cor(as.numeric(um),as.numeric(er)),"\n")
    plot(um,er);abline(0,1)
    Detath2.fd <- (dd1$Detath - dd$Detath)/eps
    um <- if (nt>1) dd$Detath2[,ind] else dd$Detath2
    er <- if (nt>1) Detath2.fd[,i:nt] else Detath2.fd
    cat("Detath2[",i,",]: rdiff = ",range(um-er)," cor = ",cor(as.numeric(um),as.numeric(er)),"\n")
    ## cat("Detath2[",i,",]: rdiff = ",range(dd$Detath2-Detath2.fd)," cor = ",cor(dd$Detath2,Detath2.fd),"\n")
    plot(um,er);abline(0,1)
    Deta2th2.fd <- (dd1$Deta2th - dd$Deta2th)/eps
    um <- if (nt>1) dd$Deta2th2[,ind] else dd$Deta2th2
    er <- if (nt>1) Deta2th2.fd[,i:nt] else Deta2th2.fd
    cat("Deta2th2[",i,",]: rdiff = ",range(um-er)," cor = ",cor(as.numeric(um),as.numeric(er)),"\n")
    ## cat("Deta2th2[",i,",]: rdiff = ",range(dd$Deta2th2-Deta2th2.fd)," cor = ",cor(dd$Deta2th2,Deta2th2.fd),"\n")
    ind <- max(ind)+1:(nt-i)
    plot(um,er);abline(0,1)
  }
} ## fetad.test

corb <- function(x,z) {
## alternative to cor for measuring similarity of x and z,
## which is not scaling invariant. So 1 really means x and z
## are very close, not just linearly related.
  d <- x-z
  1-mean(d^2)/(sd(x)*sd(z))
}

fmud.test <- function(y,mu,wt,theta,fam,eps = 1e-7,plot=TRUE) {
## test family deviance derivatives w.r.t. mu by finite differencing.
## copy to make debugging easier...
  Dd <- fam$Dd;dev.resids <- fam$dev.resids
  dd <- Dd(y, mu, theta, wt, level=2)
  dev <- dev.resids(y, mu, wt,theta)
  dev1 <- dev.resids(y, mu+eps, wt,theta)
  Dmu.fd <- (dev1-dev)/eps
  cat("Dmu: rdiff = ",range(dd$Dmu-Dmu.fd)," cor = ",corb(dd$Dmu,Dmu.fd),"\n")
  if (plot) {
    pch <- 19;cex <- .4 ## only used inside later if (plot) blocks
    plot(dd$Dmu,Dmu.fd,pch=pch,cex=cex);abline(0,1,col=2)
    oask <- devAskNewPage(TRUE)
    on.exit(devAskNewPage(oask)) ## restore paging state on exit
  }
  nt <- length(theta)
  if (fam$n.theta>0&&nt>0) for (i in 1:nt) { ## first derivs w.r.t. each theta
    th1 <- theta;th1[i] <- th1[i] + eps
    dev1 <- dev.resids(y, mu, wt,th1)
    Dth.fd <- (dev1-dev)/eps
    um <- if (nt>1) dd$Dth[,i] else dd$Dth
    cat("Dth[",i,"]: rdiff = ",range(um-Dth.fd)," cor = ",corb(um,Dth.fd),"\n",sep="")
    if (plot) { plot(um,Dth.fd,pch=pch,cex=cex,xlab=paste("Dth[",i,"]",sep="")); abline(0,1,col=2)}
  }
  ## second order up...
  dd1 <- Dd(y, mu+eps, theta, wt, level=2)
  Dmu2.fd <- (dd1$Dmu - dd$Dmu)/eps
  cat("Dmu2: rdiff = ",range(dd$Dmu2-Dmu2.fd)," cor = ",corb(dd$Dmu2,Dmu2.fd),"\n")
  if (plot) { plot(dd$Dmu2,Dmu2.fd,pch=pch,cex=cex);abline(0,1,col=2)}
  Dmu3.fd <- (dd1$Dmu2 - dd$Dmu2)/eps
  cat("Dmu3: rdiff = ",range(dd$Dmu3-Dmu3.fd)," cor = ",corb(dd$Dmu3,Dmu3.fd),"\n")
  if (plot) { plot(dd$Dmu3,Dmu3.fd,pch=pch,cex=cex);abline(0,1,col=2)}
  Dmu4.fd <- (dd1$Dmu3 - dd$Dmu3)/eps
  cat("Dmu4: rdiff = ",range(dd$Dmu4-Dmu4.fd)," cor = ",corb(dd$Dmu4,Dmu4.fd),"\n")
  if (plot) { plot(dd$Dmu4,Dmu4.fd,pch=pch,cex=cex);abline(0,1,col=2)}
  ## and now the higher derivs wrt theta
  ind <- 1:nt
  if (fam$n.theta>0&&nt>0) for (i in 1:nt) {
    th1 <- theta;th1[i] <- th1[i] + eps
    dd1 <- Dd(y, mu, th1, wt, level=2)
    Dmuth.fd <- (dd1$Dmu - dd$Dmu)/eps
    um <- if (nt>1) dd$Dmuth[,i] else dd$Dmuth
    cat("Dmuth[",i,"]: rdiff = ",range(um-Dmuth.fd)," cor = ",corb(um,Dmuth.fd),"\n")
    if (plot) { plot(um,Dmuth.fd,pch=pch,cex=cex,xlab=paste("Dmuth[",i,"]",sep=""));abline(0,1,col=2)}
    Dmu2th.fd <- (dd1$Dmu2 - dd$Dmu2)/eps
    um <- if (nt>1) dd$Dmu2th[,i] else dd$Dmu2th
    cat("Dmu2th[",i,"]: rdiff = ",range(um-Dmu2th.fd)," cor = ",corb(um,Dmu2th.fd),"\n")
    if (plot) { plot(um,Dmu2th.fd,pch=pch,cex=cex,xlab=paste("Dmu2th[",i,"]",sep=""));abline(0,1,col=2)}
    if (!is.null(dd$EDmu2th)) { ## expected version, only if family supplies it
      EDmu2th.fd <- (dd1$EDmu2 - dd$EDmu2)/eps
      um <- if (nt>1) dd$EDmu2th[,i] else dd$EDmu2th
      cat("EDmu2th[",i,"]: rdiff = ",range(um-EDmu2th.fd)," cor = ",corb(um,EDmu2th.fd),"\n")
      if (plot) { plot(um,EDmu2th.fd,pch=pch,cex=cex,xlab=paste("EDmu2th[",i,"]",sep=""));abline(0,1,col=2)}
    }
    Dmu3th.fd <- (dd1$Dmu3 - dd$Dmu3)/eps
    um <- if (nt>1) dd$Dmu3th[,i] else dd$Dmu3th
    cat("Dmu3th[",i,"]: rdiff = ",range(um-Dmu3th.fd)," cor = ",corb(um,Dmu3th.fd),"\n")
    if (plot) { plot(um,Dmu3th.fd,pch=pch,cex=cex,xlab=paste("Dmu3th[",i,"]",sep=""));abline(0,1,col=2)}
    ## now the 3 second derivative w.r.t. theta terms...
    Dth2.fd <- (dd1$Dth - dd$Dth)/eps
    um <- if (nt>1) dd$Dth2[,ind] else dd$Dth2
    er <- if (nt>1) Dth2.fd[,i:nt] else Dth2.fd
    cat("Dth2[",i,",]: rdiff = ",range(um-er)," cor = ",corb(as.numeric(um),as.numeric(er)),"\n")
    if (plot) { plot(um,er,pch=pch,cex=cex,xlab=paste("Dth2[",i,",]",sep=""),ylab="fd");abline(0,1,col=2)}
    Dmuth2.fd <- (dd1$Dmuth - dd$Dmuth)/eps
    um <- if (nt>1) dd$Dmuth2[,ind] else dd$Dmuth2
    er <- if (nt>1) Dmuth2.fd[,i:nt] else Dmuth2.fd
    cat("Dmuth2[",i,",]: rdiff = ",range(um-er)," cor = ",corb(as.numeric(um),as.numeric(er)),"\n")
    if (plot) { plot(um,er,pch=pch,cex=cex,xlab=paste("Dmuth2[",i,",]",sep=""),ylab="fd");abline(0,1,col=2)}
    Dmu2th2.fd <- (dd1$Dmu2th - dd$Dmu2th)/eps
    um <- if (nt>1) dd$Dmu2th2[,ind] else dd$Dmu2th2
    er <- if (nt>1) Dmu2th2.fd[,i:nt] else Dmu2th2.fd
    cat("Dmu2th2[",i,",]: rdiff = ",range(um-er)," cor = ",corb(as.numeric(um),as.numeric(er)),"\n")
    if (plot) { plot(um,er,pch=pch,cex=cex,xlab=paste("Dmu2th2[",i,",]",sep=""),ylab="fd");abline(0,1,col=2)}
    ind <- max(ind)+1:(nt-i)
  }
} ## fmud.test

gam.fit4 <- function(x, y, sp, Eb,UrS=list(), weights = rep(1, nobs), start = NULL, etastart = NULL,
                     mustart = NULL, offset = rep(0, nobs),U1=diag(ncol(x)), Mp=-1, family = gaussian(),
                     control = gam.control(), deriv=2,gamma=1,
scale=1,scoreType="REML",null.coef=rep(0,ncol(x)),nei=NULL,...) {
## Routine for fitting GAMs beyond exponential family.
## Inputs as gam.fit3 except that family is of class "extended.family", while
## sp contains the vector of extended family parameters, followed by the log smoothing parameters,
## followed by the log scale parameter if scale < 0
  ## some families have second derivative of deviance, and hence iterative weights
  ## very close to zero for some data. This can lead to poorly scaled sqrt(w)z
  ## and it is better to base everything on wz...
  if (is.null(family$use.wz)) family$use.wz <- FALSE
  warn <- list()
  if (family$n.theta>0) { ## there are extra parameters to estimate
    ind <- 1:family$n.theta
    theta <- sp[ind] ## parameters of the family
    family$putTheta(theta)
    sp <- sp[-ind] ## log smoothing parameters
  } else theta <- family$getTheta() ## fixed value
  ## penalized <- if (length(UrS)>0) TRUE else FALSE
  if (scale>0) scale.known <- TRUE else {
    ## unknown scale parameter, trial value supplied as
    ## final element of sp.
    scale.known <- FALSE
    nsp <- length(sp)
    scale <- exp(sp[nsp])
    sp <- sp[-nsp]
  }
  x <- as.matrix(x)
  nSp <- length(sp)
  rank.tol <- .Machine$double.eps*100 ## tolerance to use for rank deficiency
  q <- ncol(x)
  n <- nobs <- nrow(x)
  xnames <- dimnames(x)[[2]]
  ynames <- if (is.matrix(y)) rownames(y) else names(y)
  ## Now a stable re-parameterization is needed....
  if (length(UrS)) {
    grderiv <- if (scoreType=="EFS") 1 else deriv
    rp <- gam.reparam(UrS,sp,grderiv)
    T <- diag(q)
    T[1:ncol(rp$Qs),1:ncol(rp$Qs)] <- rp$Qs
    T <- U1%*%T ## new params b'=T'b old params
    null.coef <- t(T)%*%null.coef
    if (!is.null(start)) start <- t(T)%*%start
    ## form x%*%T in parallel
    x <- .Call(C_mgcv_pmmult2,x,T,0,0,control$nthreads)
    rS <- list()
    for (i in 1:length(UrS)) {
      rS[[i]] <- rbind(rp$rS[[i]],matrix(0,Mp,ncol(rp$rS[[i]])))
    } ## square roots of penalty matrices in current parameterization
    Eb <- Eb%*%T ## balanced penalty matrix
    rows.E <- q-Mp
    Sr <- cbind(rp$E,matrix(0,nrow(rp$E),Mp))
    St <- rbind(cbind(rp$S,matrix(0,nrow(rp$S),Mp)),matrix(0,Mp,q))
  } else { ## unpenalized case
    grderiv <- 0
    T <- diag(q); St <- matrix(0,q,q)
    rSncol <- rows.E <- Eb <- Sr <- 0
    rS <- list(0)
    rp <- list(det=0,det1 = 0,det2 = 0,fixed.penalty=FALSE)
  }
  ## re-parameterization complete. Initialization....
  nvars <- ncol(x)
  ## NOTE(review): 'emtpy' below looks like a typo for 'empty' in the error
  ## message — runtime string left unchanged here; confirm before editing.
  if (nvars==0) stop("emtpy models not available")
  if (is.null(weights)) weights <- rep.int(1, nobs)
  if (is.null(offset)) offset <- rep.int(0, nobs)
  linkinv <- family$linkinv
  valideta <- family$valideta
  validmu <- family$validmu
  dev.resids <- family$dev.resids
  ## need an initial `null deviance' to test for initial divergence...
  ## if (!is.null(start)) null.coef <- start - can be on edge of feasible - not good
  null.eta <- as.numeric(x%*%null.coef + as.numeric(offset))
  ## call the families initialization code...
  if (is.null(mustart)) {
    eval(family$initialize)
    mukeep <- NULL
  } else {
    mukeep <- mustart
    eval(family$initialize)
    #mustart <- mukeep
  }
  old.pdev <- sum(dev.resids(y, linkinv(null.eta), weights,theta)) + t(null.coef)%*%St%*%null.coef
  if (!is.null(start)) { ## check it's at least better than null.coef
    pdev <- sum(dev.resids(y, linkinv(x%*%start+as.numeric(offset)), weights,theta)) + t(start)%*%St%*%start
    if (pdev>old.pdev) start <- mukeep <- etastart <- NULL
  }
  coefold <- null.coef ## set to default, may be replaced below
  etaold <- x %*% coefold + offset
  if (!is.null(mukeep)) mustart <- mukeep
  ## and now finalize initialization of mu and eta...
  eta <- if (!is.null(etastart)) etastart else
         if (!is.null(start))
           if (length(start) != nvars) stop("Length of start should equal ", nvars,
                " and correspond to initial coefs for ", deparse(xnames)) else {
             coefold <- start
             etaold <- offset + as.vector(if (NCOL(x) == 1) x * start else x %*% start)
           } else family$linkfun(mustart)
  mu <- linkinv(eta)
  conv <- boundary <- FALSE
  dd <- dDeta(y,mu,weights,theta,family,0) ## derivatives of deviance w.r.t. eta
  w <- dd$Deta2 * .5 ## Newton weights
  wz <- w*(eta-offset) - .5*dd$Deta ## weighted pseudodata
  z <- (eta-offset) - dd$Deta.Deta2 ## pseudodata
  good <- is.finite(z)&is.finite(w)
  zg <- rep(0,max(dim(x)))
  for (iter in 1:control$maxit) { ## start of main fitting iteration
    if (control$trace) cat(iter," ")
    if (control$trace&sum(!good)>0) cat("\n",sum(!good)," not good\n")
    if (sum(!good)) {
      use.wy <- TRUE
      good <- is.finite(w)&is.finite(wz)
      z[!is.finite(z)] <- 0 ## avoid NaN in .C call - unused anyway
    } else use.wy <- family$use.wz
    if (sum(good)==0) stop("no good data in iteration")
    ng <- sum(good)
    zg[1:ng] <- z[good] ## ensure that y dimension large enough for coefs
    oo <- .C(C_pls_fit1,
             y=as.double(zg),X=as.double(x[good,]),w=as.double(w[good]),wy = as.double(wz[good]),
             E=as.double(Sr),Es=as.double(Eb),n=as.integer(ng),
             q=as.integer(ncol(x)),rE=as.integer(rows.E),eta=as.double(z),
             penalty=as.double(1),rank.tol=as.double(rank.tol),
             nt=as.integer(control$nthreads),use.wy=as.integer(use.wy))
    posdef <- oo$n >= 0 ## C code signals indefiniteness via negative n
    if (!posdef) { ## then problem is indefinite - switch to +ve weights for this step
      if (control$trace) cat("**using positive weights\n")
      # problem is that Fisher can be very poor for zeroes
      ## index weights that are finite and positive
      good <- is.finite(dd$Deta2)
      good[good] <- dd$Deta2[good]>0
      w[!good] <- 0
      wz <- w*(eta-offset) - .5*dd$Deta
      z <- (eta-offset) - dd$Deta.Deta2
      good <- is.finite(z)&is.finite(w)
      if (sum(!good)) {
        use.wy <- TRUE
        good <- is.finite(w)&is.finite(wz)
        z[!is.finite(z)] <- 0 ## avoid NaN in .C call - unused anyway
      } else use.wy <- family$use.wz
      ng <- sum(good)
      zg[1:ng] <- z[good] ## ensure that y dimension large enough for coefs
      oo <- .C(C_pls_fit1, ##C_pls_fit1,
               y=as.double(zg),X=as.double(x[good,]),w=as.double(w[good]),wy = as.double(wz[good]),
               E=as.double(Sr),Es=as.double(Eb),n=as.integer(ng),
               q=as.integer(ncol(x)),rE=as.integer(rows.E),eta=as.double(z),
               penalty=as.double(1),rank.tol=as.double(rank.tol),
               nt=as.integer(control$nthreads),use.wy=as.integer(use.wy))
    }
    start <- oo$y[1:ncol(x)] ## current coefficient estimates
    penalty <- oo$penalty ## size of penalty
    eta <- drop(x%*%start) ## the linear predictor (less offset)
    if (any(!is.finite(start))) { ## test for breakdown
      conv <- FALSE
      warn[[length(warn)+1]] <- paste(" gam.fit4 non-finite coefficients at iteration ", iter)
      return(list(REML=NA,warn=warn)) ## return immediately signalling failure
    }
    mu <- linkinv(eta <- eta + offset)
    dev <- sum(dev.resids(y, mu, weights,theta))
    ## now step halve under non-finite deviance...
    if (!is.finite(dev)) {
      if (is.null(coefold)) {
        if (is.null(null.coef)) stop("no valid set of coefficients has been found:please supply starting values", call. = FALSE)
        ## Try to find feasible coefficients from the null.coef and null.eta
        coefold <- null.coef
        etaold <- null.eta
      }
      ii <- 1
      while (!is.finite(dev)) {
        if (ii > control$maxit) stop("inner loop 1; can't correct step size")
        ii <- ii + 1
        start <- (start + coefold)/2
        eta <- (eta + etaold)/2
        mu <- linkinv(eta)
        dev <- sum(dev.resids(y, mu, weights,theta))
      }
      boundary <- TRUE
      penalty <- t(start)%*%St%*%start ## reset penalty too
      if (control$trace) cat("Step halved: new deviance =", dev, "\n")
    } ## end of infinite deviance correction
    ## now step halve if mu or eta are out of bounds...
    if (!(valideta(eta) && validmu(mu))) {
      ii <- 1
      while (!(valideta(eta) && validmu(mu))) {
        if (ii > control$maxit) stop("inner loop 2; can't correct step size")
        ii <- ii + 1
        start <- (start + coefold)/2
        eta <- (eta + etaold)/2
        mu <- linkinv(eta)
      }
      boundary <- TRUE
      dev <- sum(dev.resids(y, mu, weights,theta))
      penalty <- t(start)%*%St%*%start ## need to reset penalty too
      if (control$trace) cat("Step halved: new deviance =", dev, "\n")
    } ## end of invalid mu/eta handling
    ## now check for divergence of penalized deviance....
    pdev <- dev + penalty ## the penalized deviance
    if (control$trace) cat("penalized deviance =", pdev, "\n")
    div.thresh <- 10*(.1+abs(old.pdev))*.Machine$double.eps^.5
    if (pdev-old.pdev>div.thresh) { ## solution diverging
      ii <- 1 ## step halving counter
      if (iter==1) { ## immediate divergence, need to shrink towards zero
        etaold <- null.eta; coefold <- null.coef
      }
      while (pdev -old.pdev > div.thresh) { ## step halve until pdev <= old.pdev
        if (ii > 100) stop("inner loop 3; can't correct step size")
        ii <- ii + 1
        start <- (start + coefold)/2
        eta <- (eta + etaold)/2
        mu <- linkinv(eta)
        dev <- sum(dev.resids(y, mu, weights,theta))
        penalty <- t(start)%*%St%*%start
        pdev <- dev + penalty ## the penalized deviance
        if (control$trace) cat("Step halved: new penalized deviance =", pdev, "\n")
      }
    } ## end of pdev divergence
    if (scoreType=="EFS"&&family$n.theta>0) { ## there are theta parameters to estimate...
      scale1 <- if (!is.null(family$scale)) family$scale else scale
      if (family$n.theta>0||scale<0) theta <- estimate.theta(theta,family,y,mu,scale=scale1,wt=weights,tol=1e-7)
      if (!is.null(family$scale) && family$scale<0) {
        scale <- exp(theta[family$n.theta+1])
        theta <- theta[1:family$n.theta]
      }
      family$putTheta(theta)
    }
    ## get new weights and pseudodata (needed now for grad testing)...
    dd <- dDeta(y,mu,weights,theta,family,0) ## derivatives of deviance w.r.t. eta
    w <- dd$Deta2 * .5; wz <- w*(eta-offset) - .5*dd$Deta
    z <- (eta-offset) - dd$Deta.Deta2
    good <- is.finite(z)&is.finite(w)
    ## convergence testing...
    if (posdef && abs(pdev - old.pdev)/(0.1 + abs(pdev)) < control$epsilon) {
      ## Need to check coefs converged adequately, to ensure implicit differentiation
      ## ok. Testing coefs unchanged is problematic under rank deficiency (not guaranteed to
      ## drop same parameter every iteration!)
      grad <- 2 * t(x[good,,drop=FALSE])%*%((w[good]*(x%*%start)[good]-wz[good]))+ 2*St%*%start
      if (max(abs(grad)) > control$epsilon*(abs(pdev)+scale)) { ## not converged quite enough
        old.pdev <- pdev
        coef <- coefold <- start
        etaold <- eta
        ##muold <- mu
      } else { ## converged
        conv <- TRUE
        coef <- start
        break
      }
    } else { ## not converged
      old.pdev <- pdev
      coef <- coefold <- start
      etaold <- eta
    }
    if (scoreType=="EFS"&&family$n.theta>0) {
      ## now recompute pdev with new theta, otherwise step control won't work at next iteration
      dev <- sum(dev.resids(y, mu, weights,theta))
      old.pdev <- pdev <- dev + penalty
    }
  } ## end of main loop
  ## so at this stage the model has been fully estimated
  coef <- as.numeric(T %*% coef) ## back to original parameterization
  ## now obtain derivatives, if these are needed...
  check.derivs <- FALSE
  while (check.derivs) { ## debugging code to check derivatives
    eps <- 1e-7
    fmud.test(y,mu,weights,theta,family,eps = eps)
    fetad.test(y,mu,weights,theta,family,eps = eps)
  }
  dd <- dDeta(y,mu,weights,theta,family,deriv)
  w <- dd$Deta2 * .5
  z <- (eta-offset) - dd$Deta.Deta2 ## - .5 * dd$Deta[good] / w
  wf <- pmax(0,dd$EDeta2 * .5) ## Fisher type weights
  wz <- w*(eta-offset) - 0.5*dd$Deta ## Wz finite when w==0
  good <- is.finite(wz)&is.finite(w)&dd$good
  if (sum(good)==0) stop("not enough finite derivatives")
  residuals <- z - (eta - offset)
  residuals[!is.finite(residuals)] <- NA
  z[!is.finite(z)] <- 0 ## avoid passing NA etc to C code
  ntot <- length(theta) + length(sp)
  rSncol <- unlist(lapply(UrS,ncol))
  ## Now drop any elements of dd that have been dropped in fitting...
  if (sum(!good)>0) { ## drop !good from fields of dd, weights and pseudodata
    z <- z[good]; w <- w[good]; wz <- wz[good]; wf <- wf[good]
    dd$Deta <- dd$Deta[good];dd$Deta2 <- dd$Deta2[good]
    dd$EDeta2 <- dd$EDeta2[good]
    if (deriv>0) dd$Deta3 <- dd$Deta3[good]
    if (deriv>1) dd$Deta4 <- dd$Deta4[good]
    if (length(theta)>1) { ## matrix valued theta derivatives
      if (deriv>0) {
        dd$Dth <- dd$Dth[good,]; dd$Detath <- dd$Detath[good,]; dd$Deta2th <- dd$Deta2th[good,]
        if (deriv>1) {
          dd$Detath2 <- dd$Detath2[good,]; dd$Deta3th <- dd$Deta3th[good,]
          dd$Deta2th2 <- dd$Deta2th2[good,];dd$Dth2 <- dd$Dth2[good,]
        }
      }
    } else { ## vector valued theta derivatives
      if (deriv>0) {
        dd$Dth <- dd$Dth[good]; dd$Detath <- dd$Detath[good]; dd$Deta2th <- dd$Deta2th[good]
        if (deriv>1) {
          dd$Detath2 <- dd$Detath2[good]; dd$Deta3th <- dd$Deta3th[good]
          dd$Deta2th2 <- dd$Deta2th2[good]; dd$Dth2 <- dd$Dth2[good]
        }
      }
    }
  }
  ## gdi.type should probably be computed after dropping via good
  gdi.type <- if (any(abs(w)<.Machine$double.xmin*1e20)||any(!is.finite(z))) 1 else 0
  if (scoreType=="EFS") scoreType <- "REML"
  oo <- .C(C_gdi2,
           X=as.double(x[good,]),E=as.double(Sr),Es=as.double(Eb),rS=as.double(unlist(rS)),
           U1 = as.double(U1),sp=as.double(exp(sp)),theta=as.double(theta),
           z=as.double(z),w=as.double(w),wz=as.double(wz),wf=as.double(wf),Dth=as.double(dd$Dth),
           Det=as.double(dd$Deta),
           Det2=as.double(dd$Deta2),Dth2=as.double(dd$Dth2),Det.th=as.double(dd$Detath),
           Det2.th=as.double(dd$Deta2th),Det3=as.double(dd$Deta3),Det.th2 = as.double(dd$Detath2),
           Det4 = as.double(dd$Deta4),Det3.th=as.double(dd$Deta3th), Deta2.th2=as.double(dd$Deta2th2),
           beta=as.double(coef),b1=as.double(rep(0,ntot*ncol(x))),w1=as.double(rep(0,ntot*length(z))),
           D1=as.double(rep(0,ntot)),D2=as.double(rep(0,ntot^2)),
           P=as.double(0),P1=as.double(rep(0,ntot)),P2 = as.double(rep(0,ntot^2)),
           ldet=as.double(1-2*(scoreType=="ML")),ldet1 = as.double(rep(0,ntot)),
           ldet2 = as.double(rep(0,ntot^2)),
           rV=as.double(rep(0,ncol(x)^2)),
           rank.tol=as.double(.Machine$double.eps^.75),rank.est=as.integer(0),
           n=as.integer(sum(good)),q=as.integer(ncol(x)),M=as.integer(nSp),
           n.theta=as.integer(length(theta)), Mp=as.integer(Mp),Enrow=as.integer(rows.E),
           rSncol=as.integer(rSncol),deriv=as.integer(deriv),
           fixed.penalty = as.integer(rp$fixed.penalty),nt=as.integer(control$nthreads),
           type=as.integer(gdi.type),dVkk=as.double(rep(0,nSp^2)))
  rV <- matrix(oo$rV,ncol(x),ncol(x)) ## rV%*%t(rV)*scale gives covariance matrix
  #rV <- rV # transform before return
  ## derivatives of coefs w.r.t. sps etc...
  ## note that db.drho and dw.drho start with derivs wrt theta, then wrt sp (no scale param of course)
  db.drho <- if (deriv) matrix(oo$b1,ncol(x),ntot) else NULL ## transform before return
  dw.drho <- if (deriv) matrix(oo$w1,length(z),ntot) else NULL
  Kmat <- matrix(0,nrow(x),ncol(x))
  Kmat[good,] <- oo$X ## rV%*%t(K)%*%(sqrt(wf)*X) = F; diag(F) is edf array
  Vg <- NCV <- NCV1 <- REML <- REML1 <- REML2 <- NULL
  if (scoreType=="NCV") { ## neighbourhood cross validation score
    eta.cv <- rep(0.0,length(nei$i))
    deta.cv <- if (deriv) matrix(0.0,length(nei$i),ntot) else matrix(0.0,1,ntot)
    w1 <- -dd$Deta/2; w2 <- dd$Deta2/2; dth <- dd$Detath/2 ## !?
    R <- try(chol(crossprod(x,w*x)+St),silent=TRUE)
    if (nei$jackknife > 2) { ## return NCV coef changes for each fold
      if (deriv>0) stop("jackknife and derivatives requested together")
      dth <- matrix(0,ncol(x),length(nei$m))
      deriv1 <- -1 ## signal to return coef changes
    } else deriv1 <- deriv
    if (inherits(R,"try-error")) { ## use CG approach...
      Hi <- tcrossprod(rV) ## inverse of penalized Expected Hessian - inverse actual Hessian probably better
      cg.iter <- .Call(C_ncv,x,Hi,w1,w2,db.drho,dw.drho,rS,nei$i-1,nei$mi,nei$m,nei$k-1,oo$beta,exp(sp),eta.cv,
                       deta.cv, dth, deriv1);
      warn[[length(warn)+1]] <- "NCV positive definite update check not possible"
    } else { ## use Cholesky update approach
      pdef.fails <- .Call(C_Rncv,x,R,w1,w2,db.drho,dw.drho,rS,nei$i-1,nei$mi,nei$m,nei$k-1,oo$beta,exp(sp),eta.cv,
                          deta.cv, dth, deriv1,.Machine$double.eps,control$ncv.threads);
      if (pdef.fails) warn[[length(warn)+1]] <- "some NCV updates not positive definite"
    }
    mu.cv <- linkinv(eta.cv)
    nt <- family$n.theta
    ## if some elements of theta are fixed, figure out which derivatives will be needed...
    if (deriv) keep <- if (length(theta)>nt) (length(theta)+1):(ncol(db.drho)+!scale.known) else
                       1:(ncol(db.drho)+!scale.known)
    dev0 <- dev.resids(y[nei$i], mu[nei$i], weights[nei$i],theta)
    ls0 <- family$ls(y[nei$i],weights[nei$i],theta,scale)
    if (family$qapprox) { ## quadratic approximation to NCV
      qdev <- dev0 + gamma*dd$Deta[nei$i]*(eta.cv-eta[nei$i]) + 0.5*gamma*dd$Deta2[nei$i]*(eta.cv-eta[nei$i])^2
      NCV <- sum(qdev)/(2*scale) - ls0$ls
      if (deriv) {
        deta <- x %*% db.drho
        ncv1 <- (dd$Deta[nei$i]*((1-gamma)*deta[nei$i,,drop=FALSE]+gamma*deta.cv) +
                 gamma*dd$Deta2[nei$i]*deta.cv*(eta.cv-eta[nei$i]) +
                 0.5*gamma*as.numeric(dd$Deta3[nei$i])*deta[nei$i,,drop=FALSE]*(eta.cv-eta[nei$i])^2)/(2*scale)
        if (nt>0) { ## deal with direct dependence on the theta parameters
          ncv1[,1:nt] <- ncv1[,1:nt]- ls0$LSTH1[,1:nt] +
            if (nt==1) (dd$Dth[nei$i] + gamma*dd$Detath[nei$i]*(eta.cv-eta[nei$i]) +
                        0.5*gamma*dd$Deta2th[nei$i]*(eta.cv-eta[nei$i])^2)/(2*scale) else
                       (dd$Dth[nei$i,] + gamma*dd$Detath[nei$i,]*(eta.cv-eta[nei$i]) +
                        0.5*gamma*dd$Deta2th[nei$i,]*(eta.cv-eta[nei$i])^2)/(2*scale)
        }
        if (!scale.known) {
          ncv1 <- cbind(ncv1,-qdev/(2*scale) - ls0$LSTH1[,1+nt])
        }
      }
    } else { ## exact NCV
      dev.cv <- dev.resids(y, mu.cv, weights,theta)
      NCV <- sum(dev.cv)/(2*scale) - ls0$ls
      DEV <- sum(dev0)/(2*scale) - ls0$ls
      if (gamma!=1) NCV <- gamma*NCV - (gamma-1)*DEV
      if (deriv) {
        dd.cv <- dDeta(y[nei$i],mu.cv,weights[nei$i],theta,family,1)
        ncv1 <- dd.cv$Deta*deta.cv/(2*scale)
        if (gamma!=1) dev1 <- (dd$Deta*(x%*%db.drho))[nei$i,,drop=FALSE]/(2*scale)
        if (nt>0) {
          ncv1[,1:nt] <- ncv1[,1:nt] + as.matrix(dd.cv$Dth/(2*scale)) - ls0$LSTH1[,1:nt]
          if (gamma!=1) dev1[,1:nt] <- dev1[,1:nt] + as.matrix(dd$Dth/(2*scale))[nei$i,,drop=FALSE] - ls0$LSTH1[,1:nt]
        }
        if (!scale.known) { ## deal with log scale parameter derivative
          ncv1 <- cbind(ncv1,-dev.cv/(2*scale) - ls0$LSTH1[,1+nt])
          ## NOTE(review): 'lSTH1' below looks like a typo for 'LSTH1'
          ## (cf. the line above) — confirm against family$ls return fields.
          if (gamma!=1) dev1 <- cbind(dev1,-dev0/(2*scale) - ls0$lSTH1[,1+nt])
        }
        if (gamma!=1) ncv1 <- gamma*ncv1 - (gamma-1)*dev1
      }
    } ## exact NCV
    if (nei$jackknife>2) {
      nk <- c(nei$m[1],diff(nei$m)) ## dropped fold sizes
      jkw <- sqrt((nobs-nk)/(nobs*nk)) ## jackknife weights
      dth <- jkw*t(dth)%*%t(T)
      #Vj <- crossprod(dd) ## jackknife cov matrix for coefs (beta)
      #attr(Vj,"dd") <- dd
      #attr(NCV,"Vj") <- Vj
      attr(NCV,"dd") <- dth
    }
    attr(NCV,"eta.cv") <- eta.cv
    if (deriv) {
      attr(NCV,"deta.cv") <- deta.cv;
      NCV1 <- colSums(ncv1)
      NCV1 <- NCV1[keep] ## drop derivatives for any fixed theta parameters
      Vg <- crossprod(ncv1[,keep,drop=FALSE]) ## empirical cov matrix of grad
    }
  } else { ## RE/ML
    D2 <- matrix(oo$D2,ntot,ntot); ldet2 <- matrix(oo$ldet2,ntot,ntot)
    bSb2 <- matrix(oo$P2,ntot,ntot)
    ## compute the REML score...
    ls <- family$ls(y,weights,theta,scale)
    nt <- length(theta)
    lsth1 <- if (nt>0) ls$lsth1[1:nt] else rep(0,0)
    lsth2 <- if (nt>0) as.matrix(ls$lsth2)[1:nt,1:nt] else matrix(0,0,0) ## exclude any derivs w.r.t log scale here
    REML <- ((dev+oo$P)/(2*scale) - ls$ls)/gamma + (oo$ldet - rp$det)/2 -
            as.numeric(scoreType=="REML") * Mp * (log(2*pi*scale)/2-log(gamma)/2)
    if (deriv) {
      det1 <- oo$ldet1
      if (nSp) {
        ind <- 1:nSp + length(theta)
        det1[ind] <- det1[ind] - rp$det1
      }
      REML1 <- ((oo$D1+oo$P1)/(2*scale) - c(lsth1,rep(0,length(sp))))/gamma + (det1)/2
      if (deriv>1) {
        ls2 <- D2*0;if (nt>0) ls2[1:nt,1:nt] <- lsth2
        if (nSp) ldet2[ind,ind] <- ldet2[ind,ind] - rp$det2
        REML2 <- ((D2+bSb2)/(2*scale) - ls2)/gamma + ldet2/2
      }
    }
    if (!scale.known&&deriv) { ## need derivatives wrt log scale, too
      Dp <- dev + oo$P
      dlr.dlphi <- (-Dp/(2 *scale) - ls$lsth1[nt+1])/gamma - as.numeric(scoreType=="REML") * Mp/2
      d2lr.d2lphi <- (Dp/(2*scale) - ls$lsth2[nt+1,nt+1])/gamma
      d2lr.dspphi <- -(oo$D1+oo$P1)/(2*scale*gamma)
      if (nt>0) d2lr.dspphi[1:nt] <- d2lr.dspphi[1:nt] - ls$lsth2[nt+1,1:nt]/gamma
      REML1 <- c(REML1,dlr.dlphi)
      if (deriv==2) {
        REML2 <- rbind(REML2,as.numeric(d2lr.dspphi))
        REML2 <- cbind(REML2,c(as.numeric(d2lr.dspphi),d2lr.d2lphi))
      }
    }
  }
  nth <- length(theta)
  if (deriv) db.drho <- T %*% db.drho
  if (deriv>0&&family$n.theta==0&&nth>0) { ## need to drop derivs for fixed theta
    REML1 <- REML1[-(1:nth)]
    if (deriv>1) REML2 <- REML2[-(1:nth),-(1:nth)]
    db.drho <- db.drho[,-(1:nth),drop=FALSE]
  }
  names(coef) <- xnames
  names(residuals) <- ynames
  wtdmu <- sum(weights * y)/sum(weights) ## has to then be corrected when this is incorrect
  ## wtdmu <- sum(weights * mu)/sum(weights) ## changed from y
  nulldev <- sum(dev.resids(y, rep(wtdmu,length(y)), weights)) ## this will be corrected in family postproc
  n.ok <- nobs - sum(weights == 0)
  nulldf <- n.ok
  ww <- wt <- rep.int(0, nobs)
  wt[good] <- wf
  ww[good] <- w
  if (deriv && nrow(dw.drho)!=nrow(x)) { ## expand dw.drho back to full n rows
    w1 <- dw.drho
    dw.drho <- matrix(0,nrow(x),ncol(w1))
    dw.drho[good,] <- w1
  }
  ## dev is used in family$aic to estimate the scale as dev/sum(weights), or is unused.
  ## the form is for compatibility with exponential families, but the estimator can
  ## be biased for low count data in the extended family case. So better to force the
  ## actual scale estimate to be used...
  aic.model <- family$aic(y, mu, theta, weights, dev=scale*sum(weights)) # note: incomplete 2*edf needs to be added
  list(coefficients = coef,residuals=residuals,fitted.values = mu,
       family=family, linear.predictors = eta,deviance=dev,
       null.deviance=nulldev,iter=iter, weights=wt, ## note that these are Fisher type weights
       prior.weights=weights, working.weights = ww, ## working weights
       df.null = nulldf, y = y, converged = conv,z=z,
       boundary = boundary,
       REML=REML,REML1=REML1,REML2=REML2,NCV=NCV,NCV1=NCV1,
       rV=T %*% rV,db.drho=db.drho,dw.drho=dw.drho,
       scale.est=scale,reml.scale=scale, aic=aic.model, rank=oo$rank.est,
       K=Kmat,control=control,Vg=Vg,
       dVkk = matrix(oo$dVkk,nSp,nSp),ldetS1 = if (grderiv) rp$det1 else 0
       #,D1=oo$D1,D2=D2,
       #ldet=oo$ldet,ldet1=oo$ldet1,ldet2=ldet2,
       #bSb=oo$P,bSb1=oo$P1,bSb2=bSb2,
       #ls=ls$ls,ls1=ls$lsth1,ls2=ls$lsth2
      )
} ## gam.fit4

efsudr <- function(x,y,lsp,Eb,UrS,weights,family,offset=0,start=NULL,etastart=NULL,mustart=NULL,
                   U1=diag(ncol(x)), intercept = TRUE,scale=1,Mp=-1,control=gam.control(),n.true=-1,...) {
## Extended Fellner-Schall method for regular and extended families,
## with PIRLS performed by gam.fit3/4.
## tr(S^-S_j) is returned by ldetS1, rV %*% t(rV)*scale is
## cov matrix. I think b'S_jb will need to be computed here.
nsp <- length(UrS) if (inherits(family,"extended.family")) { spind <- family$n.theta + 1:nsp thind <- if (family$n.theta>0) 1:family$n.theta else rep(0,0) } else { thind <- rep(0,0) spind <- 1:nsp ## index of smoothing params in lsp } estimate.scale <- (length(lsp)>max(spind)) lsp[spind] <- lsp[spind] + 2.5 mult <- 1 fit <- gam.fit3(x=x, y=y, sp=lsp, Eb=Eb,UrS=UrS, weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family, control = control, intercept = intercept,deriv=0, gamma=1,scale=scale,scoreType="EFS", n.true=n.true,...) if (length(thind)>0) lsp[thind] <- family$getTheta() if (estimate.scale) lsp[length(lsp)] <- log(fit$scale) ## Also need scale estimate. OK from gam.fit3, but gam.fit4 version probably needs correcting ## for edf, as obtained via MLE. p <- ncol(x) n <- nrow(x) score.hist <- rep(0,200) bSb <- trVS <- rep(0,nsp) for (iter in 1:200) { start <- fit$coefficients Y <- U1[,1:(ncol(U1)-Mp)] ## penalty range space ## project coefs and rV to Y, since this is space of UrS[[i]] Yb <- drop(t(Y)%*%start) rV <- t(fit$rV) ## so t(rV)%*%rV*scale is cov matrix rVY <- rV %*% Y ## ith penalty is UrS[[i]]%*%t(UrS[[i]])... for (i in 1:length(UrS)) { xx <- Yb %*% UrS[[i]] bSb[i] <- sum(xx^2) xx <- rVY %*% UrS[[i]] trVS[i] <- sum(xx^2) } edf <- p - sum(trVS*exp(lsp[spind])) if (inherits(family,"extended.family")&&estimate.scale) { fit$scale <- fit$scale*n/(n-edf) ## correct for edf. 
} a <- pmax(0,fit$ldetS1*exp(-lsp[spind]) - trVS) ## NOTE: double check scaling here phi <- if (estimate.scale) fit$scale else scale r <- a/pmax(0,bSb)*phi r[a==0&bSb==0] <- 1 r[!is.finite(r)] <- 1e6 lsp1 <- lsp lsp1[spind] <- pmin(lsp[spind] + log(r)*mult,control$efs.lspmax) max.step <- max(abs(lsp1-lsp)) old.reml <- fit$REML fit <- gam.fit3(x=x, y=y, sp=lsp1, Eb=Eb,UrS=UrS, weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family, control = control, intercept = intercept,deriv=0,mustart=mustart, gamma=1,scale=scale,scoreType="EFS", n.true=n.true,...) if (length(thind)>0) lsp1[thind] <- family$getTheta() if (estimate.scale) lsp1[length(lsp)] <- log(fit$scale) ## some step length control... if (fit$REML<=old.reml) { ## improvement if (max.step<.05) { ## consider step extension (near optimum) lsp2 <- lsp lsp2[spind] <- pmin(lsp[spind] + log(r)*mult*2,control$efs.lspmax) ## try extending step... fit2 <- gam.fit3(x=x, y=y, sp=lsp2, Eb=Eb,UrS=UrS, weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family, control = control, intercept = intercept,deriv=0,mustart=mustart, gamma=1,scale=scale,scoreType="EFS", n.true=n.true,...) if (length(thind)>0) lsp2[thind] <- family$getTheta() if (estimate.scale) lsp2[length(lsp)] <- log(fit$scale) if (fit2$REML < fit$REML) { ## improvement - accept extension fit <- fit2;lsp <- lsp2 mult <- mult * 2 } else { ## accept old step lsp <- lsp1 } } else lsp <- lsp1 } else { ## no improvement while (fit$REML > old.reml&&mult>1) { ## don't contract below 1 as update doesn't have to improve REML mult <- mult/2 ## contract step lsp1 <- lsp lsp1[spind] <- pmin(lsp[spind] + log(r)*mult,control$efs.lspmax) fit <- gam.fit3(x=x, y=y, sp=lsp1, Eb=Eb,UrS=UrS, weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family, control = control, intercept = intercept,deriv=0,mustart=mustart, gamma=1,scale=scale,scoreType="EFS", n.true=n.true,...) 
if (length(thind)>0) lsp1[thind] <- family$getTheta() if (estimate.scale) lsp1[length(lsp)] <- log(fit$scale) } lsp <- lsp1 if (mult<1) mult <- 1 } score.hist[iter] <- fit$REML ## break if EFS step small and REML change negligible over last 3 steps. if (iter>3 && max.step<.05 && max(abs(diff(score.hist[(iter-3):iter])))0) TRUE else FALSE warn <- list() nSp <- length(lsp) q <- ncol(x) nobs <- length(y) if (penalized) { Eb <- attr(Sl,"E") ## balanced penalty sqrt ## the stability reparameterization + log|S|_+ and derivs... rp <- ldetS(Sl,rho=lsp,fixed=rep(FALSE,length(lsp)),np=q,root=TRUE) x <- Sl.repara(rp$rp,x) ## apply re-parameterization to x Eb <- Sl.repara(rp$rp,Eb) ## root balanced penalty St <- crossprod(rp$E) ## total penalty matrix E <- rp$E ## root total penalty attr(E,"use.unscaled") <- TRUE ## signal initialization code that E not to be further scaled if (!is.null(start)) start <- Sl.repara(rp$rp,start) ## re-para start ## NOTE: it can be that other attributes need re-parameterization here ## this should be done in 'family$initialize' - see mvn for an example. } else { ## unpenalized so no derivatives required deriv <- 0 rp <- list(ldetS=0,rp=list()) St <- matrix(0,q,q) E <- matrix(0,0,q) ## can be needed by initialization code } ## now call initialization code, but make sure that any ## supplied 'start' vector is not overwritten... start0 <- start ## Assumption here is that the initialization code is fine with ## re-parameterized x... eval(family$initialize) if (!is.null(start0)) start <- start0 coef <- as.numeric(start) if (is.null(weights)) weights <- rep.int(1, nobs) if (is.null(offset)) offset <- rep.int(0, nobs) ## get log likelihood, grad and Hessian (w.r.t. coefs - not s.p.s) ... 
llf <- family$ll ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1) ll0 <- ll$l - drop(t(coef)%*%St%*%coef)/2 grad <- ll$lb - St%*%coef iconv <- max(abs(grad))0) stop("non finite values in Hessian") if (min(D)<=0) { ## 2/2/19 replaces any D<0 indicating indef Dthresh <- max(D)*sqrt(.Machine$double.eps) if (-min(D) < Dthresh) { ## could be indef or +ve semi def indefinite <- FALSE D[D0]),max(ev)*1e-6)*mult mult <- mult*10 ev[ev0) { ## limit step length to .1 of coef length s.norm <- sqrt(sum(step^2)) c.norm <- sqrt(c.norm) if (s.norm > .1*c.norm) step <- step*0.1*c.norm/s.norm } s.norm <- sqrt(sum(step^2)) ## store for scaling steepest if needed ## try the Newton step... coef1 <- coef + step ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=1) ll1 <- ll$l - drop(t(coef1)%*%St%*%coef1)/2 khalf <- 0;fac <- 2 ## with ll1 < ll0 in place of ll1 <= ll0 in next line than we can repeatedly accept ## miniscule steps that do not actually improve anything. llold <- ll ## avoid losing lbb slot on stp failure no.change <- 0 while ((!is.finite(ll1)||ll1 <= ll0) && khalf < 25) { ## step halve until it succeeds... step <- step/fac;coef1 <- coef + step ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=0) ll1 <- ll$l - (t(coef1)%*%St%*%coef1)/2 if (is.finite(ll1)&&ll1>=ll0) { ## improvement, or at least no worse. ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=1) } if (ll1 == ll0) no.change <- no.change + 1 ## abort if step has made no difference (e.g. 2 iteractions of exactly no improvement)... if (max(abs(coef-coef1))1) khalf <- 100 khalf <- khalf + 1 if (khalf>5) fac <- 5 } ## end step halve ## Following tries steepest ascent if Newton failed. Only do this if not initially converged. 
## Otherwise it is possible for SA to mess up when routine is called ## with converged parameter values - takes a tiny step that leads to a machine noise ## improvement, but pushes a gradient just over threshold, Newton then continues to fail ## and we get stuck with SA convergence rates. ## One reason to try SA (if grad not already zero) is that Newton step can become ## poor due to numerical conditioning problems, whereas SA is largely unaffected. if (!is.finite(ll1) || (ll1<=ll0 && !iconv)) { ## switch to steepest ascent #step <- 0.5*drop(grad)*mean(abs(coef))/mean(abs(grad)) step <- drop(grad)*s.norm/sqrt(sum(grad^2)) ## scaled to initial Newton step length khalf <- 0 } no.change <- 0 while ((!is.finite(ll1)||(ll1 <= ll0 && !iconv)) && khalf < 25) { ## step cut until it succeeds... step <- step/10;coef1 <- coef + step ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=0) ll1 <- ll$l - (t(coef1)%*%St%*%coef1)/2 if (is.finite(ll1)&&ll1>=ll0) { ## improvement, or no worse ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=1) } if (ll1 == ll0) no.change <- no.change + 1 ## abort if step has made no difference... if (max(abs(coef-coef1))1) khalf <- 100 ## step gone nowhere khalf <- khalf + 1 } if ((is.finite(ll1)&&ll1 >= ll0&&(khalf<25||indefinite))||iter==control$maxit) { ## step ok. Accept and test coef <- coef + step grad <- ll$lb - St%*%coef Hp <- -ll$lbb+St ## convergence test... 
ok <- (iter==control$maxit || max(abs(grad)) < control$epsilon*abs(ll0)) if (ok) { ## appears to have converged if (indefinite) { ## not a well defined maximum if (perturbed==5) stop("indefinite penalized likelihood in gam.fit5 ") if (iter<4||rank.checked) { perturbed <- perturbed + 1 coef <- coef*(1+(rep_len(c(0,1),length(coef))*.02-.01)*perturbed) + (rep_len(c(0,1),length(coef)) - 0.5 ) * mean(abs(coef))*1e-5*perturbed ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1) ll0 <- ll$l - (t(coef)%*%St%*%coef)/2 grad <- ll$lb - St%*%coef Hp <- -ll$lbb+St } else { rank.checked <- TRUE if (penalized) { Sb <- crossprod(Eb) ## balanced penalty Hb <- -ll$lbb/norm(ll$lbb,"F")+Sb/norm(Sb,"F") ## balanced penalized hessian } else Hb <- -ll$lbb/norm(ll$lbb,"F") ## apply pre-conditioning, otherwise badly scaled problems can result in ## wrong coefs being dropped... D <- abs(diag(Hb)) D[D<1e-50] <- 1;D <- D^-.5 Hb <- t(D*Hb)*D qrh <- qr(Hb,LAPACK=TRUE) rank <- Rrank(qr.R(qrh)) if (rank < q) { ## rank deficient. need to drop and continue to adjust other params drop <- sort(qrh$pivot[(rank+1):q]) ## set these params to zero bdrop <- 1:q %in% drop ## TRUE FALSE version ## now drop the parameters and recompute ll0... 
lpi <- attr(x,"lpi") xat <- attributes(x) xat$dim <- xat$dimnames <- NULL coef <- coef[-drop] St <- St[-drop,-drop] x <- x[,-drop] ## dropping columns from model matrix if (!is.null(lpi)) { ## need to adjust column indexes as well ii <- (1:q)[!bdrop];ij <- rep(NA,q) ij[ii] <- 1:length(ii) ## col i of old model matrix is col ij[i] of new for (i in 1:length(lpi)) { lpi[[i]] <- ij[lpi[[i]][!(lpi[[i]]%in%drop)]] # drop and shuffle up } } ## lpi adjustment done if (length(xat)>0) for (i in 1:length(xat)) attr(x,names(xat)[i]) <- xat[[i]] attr(x,"lpi") <- lpi attr(x,"drop") <- drop ## useful if family has precomputed something from x ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1) ll0 <- ll$l - (t(coef)%*%St%*%coef)/2 grad <- ll$lb - St%*%coef Hp <- -ll$lbb+St } } } else { ## not indefinite really converged converged <- TRUE # ... don't break until L and D made to match final Hp } } else ll0 <- ll1 ## step ok but not converged yet } else { ## step failed. ll <- llold ## restore old ll with lbb slot if (is.null(drop)) bdrop <- rep(FALSE,q) if (iconv && iter==1) { ## OK to fail on first step if apparently converged to start with converged <- TRUE ## Note: important to check if improvement possible even if apparently coef <- start ## converged, otherwise sp changes can lead to no sp objective change! } else { converged <- FALSE ## NOTE: the threshold can be unrealistic if gradient or step can't be computed to this accuracy, e.g. ## because of very large smoothing parameters - could estimate grad/step accuracy from machine zero ## perturbation of grad/step calc, but perhaps too fussy. coefp <- coef*(1+rep_len(c(-1,1),length(coef))*.Machine$double.eps^.9) llp <- llf(y,x,coef,weights,family,offset=offset,deriv=1) gradp <- llp$lb - St%*%coefp err <- min(1e-3,kappaH*max(1,mean(abs(gradp-grad))/mean(abs(coefp-coef)))*.Machine$double.eps) ## err is an estimate of the acheivable relative error, capped above at 1e-3 to ensure ## this level of stability loss gets reported! 
if (max(abs(grad/drop(ll0)))>max(err,control$epsilon*2)) warn[[length(warn)+1]] <- paste("gam.fit5 step failed: max magnitude relative grad =",max(abs(grad/drop(ll0)))) } break ## no need to recompute L and D, so break now } } ## end of main fitting iteration ## at this stage the Hessian (of pen lik. w.r.t. coefs) should be +ve semi definite, ## so that the pivoted Choleski factor should exist... if (iter == 2*control$maxit&&converged==FALSE) warn[[length(warn)+1]] <- gettextf("gam.fit5 iteration limit reached: max abs grad = %g",max(abs(grad))) ldetHp <- 2*sum(log(diag(L))) - 2 * sum(log(D)) ## log |Hp| if (!is.null(drop)) { ## create full version of coef with zeros for unidentifiable fcoef <- rep(0,length(bdrop));fcoef[!bdrop] <- coef } else fcoef <- coef dVkk <- d1l <- d2l <- d1bSb <- d2bSb <- d1b <- d2b <- d1ldetH <- d2ldetH <- d1b <- d2b <- NULL ncv <- scoreType=="NCV" if (deriv>0) { ## Implicit differentiation for derivs... m <- nSp d1b <- matrix(0,rank,m) Sib <- Sl.termMult(rp$Sl,fcoef,full=TRUE) ## list of penalties times coefs if (nSp) for (i in 1:m) d1b[,i] <- -D*(backsolve(L,forwardsolve(t(L),(D*Sib[[i]][!bdrop])[piv]))[ipiv]) ## obtain the curvature check matrix... dVkk <- crossprod(L[,ipiv]%*%(d1b/D)) if (!is.null(drop)) { ## create full version of d1b with zeros for unidentifiable fd1b <- matrix(0,q,m) fd1b[!bdrop,] <- d1b } else fd1b <- d1b ## Now call the family again to get first derivative of Hessian w.r.t ## smoothing parameters, in list d1H. Alternatively, if deriv==1 and !ncv ## then tr(Hi d1H) returne as a vector in 'd1H'. ll <- if (ncv) llf(y,x,coef,weights,family,offset=offset,deriv=3,d1b=d1b,ncv=TRUE) else llf(y,x,coef,weights,family,offset=offset,deriv=2+(deriv>1),d1b=d1b,fh=D*t(D*chol2inv(L)[ipiv,ipiv])) # d1l <- colSums(ll$lb*d1b) # cancels if (deriv>1) { ## Implicit differentiation for the second derivatives is now possible... 
d2b <- matrix(0,rank,m*(m+1)/2) k <- 0 for (i in 1:m) for (j in i:m) { k <- k + 1 v <- -ll$d1H[[i]]%*%d1b[,j] + Sl.mult(rp$Sl,fd1b[,j],i)[!bdrop] + Sl.mult(rp$Sl,fd1b[,i],j)[!bdrop] d2b[,k] <- -D*(backsolve(L,forwardsolve(t(L),(D*v)[piv]))[ipiv]) if (i==j) d2b[,k] <- d2b[,k] + d1b[,i] } ## Now call family for last time to get trHid2H the tr(H^{-1} d^2 H / drho_i drho_j)... llr <- llf(y,x,coef,weights,family,offset=offset,deriv=4,d1b=d1b,d2b=d2b, Hp=Hp,rank=rank,fh = L,D=D) ## Now compute Hessian of log lik w.r.t. log sps using chain rule # d2la <- colSums(ll$lb*d2b) # cancels # k <- 0 d2l <- matrix(0,m,m) for (i in 1:m) for (j in i:m) { # k <- k + 1 d2l[j,i] <- d2l[i,j] <- # d2la[k] + # cancels t(d1b[,i])%*%ll$lbb%*%d1b[,j] } } ## if (deriv > 1) } ## if (deriv > 0) Vg <- NULL ## jackknife grad matrix in NCV case if (scoreType=="NCV") { REML <- REML1 <- REML2 <- NULL if (deriv==0) ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1,d1b=d1b,ncv=TRUE) ## otherwise l1, l2 not returned ncv <- family$ncv ## helps debugging! deriv1 <- if (nei$jackknife>2) -1 else if (deriv==0) 0 else 1 ## create nei if null - now in estimate.gam #if (is.null(nei)||is.null(nei$k)||is.null(nei$m)) nei <- list(i=1:nobs,mi=1:nobs,m=1:nobs,k=1:nobs) ## LOOCV #if (is.null(nei$i)) if (length(nei$m)==nobs) nei$mi <- nei$i <- 1:nobs else stop("unclear which points NCV neighbourhoods belong to") #if (length(nei$mi)!=length(nei$m)) stop("for NCV number of dropped and predicted neighbourhoods must match") ## complete dH if (deriv>0) { for (i in 1:length(ll$d1H)) ll$d1H[[i]] <- ll$d1H[[i]] - Sl.mult(rp$Sl,diag(q),i)[!bdrop,!bdrop] } R1 <- try(chol(t(Hp/D)/D),silent=TRUE) ll$gamma <- gamma; ## note: use of quadratic approx to NCV signalled by family$qapprox #if (overlap||inherits(R1,"try-error")) if (inherits(R1,"try-error")) { ## get H (Hp?) 
and Hi Hi <- t(D*chol2inv(L)[ipiv,ipiv])*D ret <- ncv(x,y,weights,nei,coef,family,ll,H=t(Hp/D)/D,Hi=Hi,offset=offset,dH=ll$d1H, db=d1b,deriv=deriv1) warn[[length(warn)+1]] <- "NCV update positive definite check not possible" } else { ## cholesky version ret <- ncv(x,y,weights,nei,coef,family,ll,R=R1,offset=offset,dH=ll$d1H,db=d1b, deriv=deriv1,nt=control$ncv.threads) if (ret$error>0) warn[[length(warn)+1]] <- "some NCV updates not positive definite" } NCV <- ret$NCV NCV1 <- ret$NCV1 Vg <- ret$Vg if (deriv1<0) { ## Jackknife cov matrix... nk <- c(nei$m[1],diff(nei$m)) ## dropped fold sizes jkw <- sqrt((nobs-nk)/(nobs*nk)) ## jackknife weights dd <- attr(ret$NCV,"deta.cv") #dd <-jkw*t(dd)%*%t(T) dd <- Sl.repa(rp$rp,t(dd),l=-1) ## undo repara dd <- Sl.inirep(Sl,dd,l=1) ## undo initial repara #Vj <- tcrossprod(dd) ## jackknife cov matrix #attr(NCV,"Vj") <- Vj attr(NCV,"dd") <- jkw*t(dd) } } else { ## REML required NCV <- NCV1 <- NULL ## Compute the derivatives of log|H+S|... if (deriv > 0) { ## only first derivative wrt rho required... if (deriv==1&&!is.list(ll$d1H)) { d1ldetH <- -ll$d1H for (i in 1:m) { A <- Sl.mult(rp$Sl,diag(q),i,full=TRUE)[!bdrop,!bdrop] bind <- rowSums(abs(A))!=0 ## row/cols of non-zero block A <- A[,bind,drop=FALSE] ## drop the zero columns A <- D*(backsolve(L,forwardsolve(t(L),(D*A)[piv,]))[ipiv,,drop=FALSE]) d1ldetH[i] <- d1ldetH[i] + sum(diag(A[bind,,drop=FALSE])) } } else { ## deriv>1 || is.list(ll$d1H) ## computation of d1ldetH using full d1H derivative of Hessian w.r.t. sp matrices... d1ldetH <- rep(0,m) d1Hp <- list() for (i in 1:m) { A <- -ll$d1H[[i]] + Sl.mult(rp$Sl,diag(q),i)[!bdrop,!bdrop] d1Hp[[i]] <- D*(backsolve(L,forwardsolve(t(L),(D*A)[piv,]))[ipiv,,drop=FALSE]) d1ldetH[i] <- sum(diag(d1Hp[[i]])) } } } ## if (deriv >0) if (deriv > 1) { ## second derivatives... 
d2ldetH <- matrix(0,m,m) k <- 0 for (i in 1:m) for (j in i:m) { k <- k + 1 d2ldetH[i,j] <- -sum(d1Hp[[i]]*t(d1Hp[[j]])) - llr$trHid2H[k] if (i==j) { ## need to add term relating to smoothing penalty A <- Sl.mult(rp$Sl,diag(q),i,full=TRUE)[!bdrop,!bdrop] bind <- rowSums(abs(A))!=0 ## row/cols of non-zero block A <- A[,bind,drop=FALSE] ## drop the zero columns A <- D*(backsolve(L,forwardsolve(t(L),(D*A)[piv,]))[ipiv,,drop=FALSE]) d2ldetH[i,j] <- d2ldetH[i,j] + sum(diag(A[bind,,drop=FALSE])) } else d2ldetH[j,i] <- d2ldetH[i,j] } } ## if (deriv > 1) ## Compute derivs of b'Sb... if (deriv>0) { Skb <- Sl.termMult(rp$Sl,fcoef,full=TRUE) d1bSb <- rep(0,m) for (i in 1:m) { Skb[[i]] <- Skb[[i]][!bdrop] d1bSb[i] <- sum(coef*Skb[[i]]) } } if (deriv>1) { d2bSb <- matrix(0,m,m) for (i in 1:m) { Sd1b <- St%*%d1b[,i] for (j in i:m) { d2bSb[j,i] <- d2bSb[i,j] <- 2*sum( d1b[,i]*Skb[[j]] + d1b[,j]*Skb[[i]] + d1b[,j]*Sd1b) } d2bSb[i,i] <- d2bSb[i,i] + sum(coef*Skb[[i]]) } } ## get grad and Hessian of REML score... REML <- -as.numeric((ll$l - drop(t(coef)%*%St%*%coef)/2)/gamma + rp$ldetS/2 - ldetHp/2 + Mp*(log(2*pi)/2)-log(gamma)/2) REML1 <- if (deriv<1) NULL else -as.numeric( # d1l # cancels - d1bSb/(2*gamma) + rp$ldet1/2 - d1ldetH/2 ) REML2 <- if (deriv<2) NULL else -( (d2l - d2bSb/2)/gamma + rp$ldet2/2 - d2ldetH/2 ) } ## REML computations if (control$trace) { cat("\niter =",iter," ll =",ll$l," REML =",REML," bSb =",t(coef)%*%St%*%coef/2,"\n") cat("log|S| =",rp$ldetS," log|H+S| =",ldetHp," n.drop =",length(drop),"\n") if (!is.null(REML1)) cat("REML1 =",REML1,"\n") } ## Get possibly multiple linear predictors lpi <- attr(x,"lpi") if (is.null(lpi)) { ## only one... linear.predictors <- if (is.null(offset)) as.numeric(x%*%coef) else as.numeric(x%*%coef+offset) fitted.values <- family$linkinv(linear.predictors) } else { ## multiple... 
fitted.values <- linear.predictors <- matrix(0,nrow(x),length(lpi)) if (!is.null(offset)) offset[[length(lpi)+1]] <- 0 for (j in 1:length(lpi)) { linear.predictors[,j] <- as.numeric(x[,lpi[[j]],drop=FALSE] %*% coef[lpi[[j]]]) if (!is.null(offset[[j]])) linear.predictors[,j] <- linear.predictors[,j] + offset[[j]] fitted.values[,j] <- family$linfo[[j]]$linkinv( linear.predictors[,j]) } } coef <- Sl.repara(rp$rp,fcoef,inverse=TRUE) ## undo re-parameterization of coef #coef <- Sl.repa(rp$rp,fcoef,l=-1) if (!is.null(drop)&&!is.null(d1b)) { ## create full version of d1b with zeros for unidentifiable db.drho <- matrix(0,length(bdrop),ncol(d1b));db.drho[!bdrop,] <- d1b } else db.drho <- d1b ## and undo re-para... #if (!is.null(d1b)) db.drho <- t(Sl.repara(rp$rp,t(db.drho),inverse=TRUE,both.sides=FALSE)) ## wrong if (!is.null(d1b)) db.drho <- Sl.repa(rp$rp,db.drho,l=-1) #if (!is.null(d2b)) d2b <- Sl.repa(rp$rp,d2b,l=-1) ## NOTE: DEBUG only ## Following needed for debugging H derivatives if Cholesky stabilization used #if (!is.null(ll$d1H)) for (i in 1:length(ll$d1H)) ll$d1H[[i]] <- Sl.repa(rp$rp,ll$d1H[[i]],l=2,r=1) ## debug #ll$lbb <- Sl.repa(rp$rp,ll$lbb,l=2,r=1) ## debug ret <- list(coefficients=coef,family=family,y=y,prior.weights=weights, fitted.values=fitted.values, linear.predictors=linear.predictors, scale.est=1, ### NOTE: needed by newton, but what is sensible here? REML= REML,REML1= REML1,REML2=REML2,NCV=NCV,NCV1=NCV1, rank=rank,aic = -2*ll$l, ## 2*edf needs to be added ##deviance = -2*ll$l, l= ll$l,## l1 =d1l,l2 =d2l, lbb = ll$lbb, ## Hessian of log likelihood L=L, ## chol factor of pre-conditioned penalized hessian bdrop=bdrop, ## logical index of dropped parameters D=D, ## diagonal preconditioning matrix St=St, ## total penalty matrix rp = rp$rp,Vg=Vg, db.drho = db.drho, ## derivative of penalty coefs w.r.t. log sps. 
#bSb = bSb, bSb1 = d1bSb,bSb2 = d2bSb, S1=rp$ldet1, #S=rp$ldetS,S1=rp$ldet1,S2=rp$ldet2, #Hp=ldetHp,Hp1=d1ldetH,Hp2=d2ldetH, #b2 = d2b, iter=iter,H = ll$lbb,dH = ll$d1H,dVkk=dVkk,warn=warn)#,d2H=llr$d2H) ## debugging code to allow components of 2nd deriv of hessian w.r.t. sp.s ## to be passed to deriv.check.... #if (!is.null(ll$ghost1)&&!is.null(ll$ghost2)) { # ret$ghost1 <- ll$ghost1; ret$ghost2 <- ret$ghost2 #} ret } ## end of gam.fit5 efsud <- function(x,y,lsp,Sl,weights=NULL,offset=NULL,family, control=gam.control(),Mp=-1,start=NULL) { ## Extended Fellner-Schall method for general families ## tr(S^-S_j) is returned by ldetS as ldet1 - S1 from gam.fit5 ## b'S_jb is computed as d1bSb in gam.fit5 ## tr(V S_j) will need to be computed using Sl.termMult ## Sl returned by ldetS and Vb computed as in gam.fit5.postproc. tol <- 1e-6 lsp <- lsp + 2.5 mult <- 1 fit <- gam.fit5(x=x,y=y,lsp=lsp,Sl=Sl,weights=weights,offset=offset,deriv=0,family=family, control=control,Mp=Mp,start=start,gamma=1) score.hist <- rep(0,200) tiny <- .Machine$double.eps^.5 ## used to bound above zero for (iter in 1:200) { start <- fit$coefficients ## obtain Vb... ipiv <- piv <- attr(fit$L,"pivot") p <- length(piv) ipiv[piv] <- 1:p Vb <- crossprod(forwardsolve(t(fit$L),diag(fit$D,nrow=p)[piv,,drop=FALSE])[ipiv,,drop=FALSE]) if (sum(fit$bdrop)) { ## some coefficients were dropped... 
q <- length(fit$bdrop) ibd <- !fit$bdrop Vtemp <- Vb; Vb <- matrix(0,q,q) Vb[ibd,ibd] <- Vtemp } Vb <- Sl.repara(fit$rp,Vb,inverse=TRUE) SVb <- Sl.termMult(Sl,Vb) ## this could be made more efficient trVS <- rep(0,length(SVb)) for (i in 1:length(SVb)) { ind <- attr(SVb[[i]],"ind") trVS[i] <- sum(diag(SVb[[i]][,ind])) } Sb <- Sl.termMult(Sl,start,full=TRUE) bSb <- rep(0,length(Sb)) for (i in 1:length(Sb)) { bSb[i] <- sum(start*Sb[[i]]) } a <- pmax(tiny,fit$S1*exp(-lsp) - trVS) r <- a/pmax(tiny,bSb) r[a==0&bSb==0] <- 1 r[!is.finite(r)] <- 1e6 lsp1 <- pmin(lsp + log(r)*mult,control$efs.lspmax) max.step <- max(abs(lsp1-lsp)) old.reml <- fit$REML fit <- gam.fit5(x=x,y=y,lsp=lsp1,Sl=Sl,weights=weights,offset=offset,deriv=0, family=family,control=control,Mp=Mp,start=start,gamma=1) ## some step length control... if (fit$REML<=old.reml) { ## improvement if (max.step<.05) { ## consider step extension lsp2 <- pmin(lsp + log(r)*mult*2,12) ## try extending step... fit2 <- gam.fit5(x=x,y=y,lsp=lsp2,Sl=Sl,weights=weights,offset=offset,deriv=0,family=family, control=control,Mp=Mp,start=start,gamma=1) if (fit2$REML < fit$REML) { ## improvement - accept extension fit <- fit2;lsp <- lsp2 mult <- mult * 2 } else { ## accept old step lsp <- lsp1 } } else lsp <- lsp1 } else { ## no improvement while (fit$REML > old.reml&&mult>1) { ## don't contract below 1 as update doesn't have to improve REML mult <- mult/2 ## contract step lsp1 <- pmin(lsp + log(r)*mult,control$efs.lspmax) fit <- gam.fit5(x=x,y=y,lsp=lsp1,Sl=Sl,weights=weights,offset=offset,deriv=0,family=family, control=control,Mp=Mp,start=start,gamma=1) } lsp <- lsp1 if (mult<1) mult <- 1 } score.hist[iter] <- fit$REML ## break if EFS step small and REML change negligible over last 3 steps. 
if (iter>3 && max.step<.05 && max(abs(diff(score.hist[(iter-3):iter])))0) for (i in 1:length(fit$warn)) warning(fit$warn[[i]]) fit } ## efsud gam.fit5.post.proc <- function(object,Sl,L,lsp0,S,off,gamma) { ## object is object returned by gam.fit5, Sl is penalty object, L maps working sp ## vector to full sp vector ## Computes: ## R - unpivoted Choleski of estimated expected hessian of ll ## Vb - the Bayesian cov matrix, ## Ve - "frequentist" alternative ## F - the EDF matrix ## edf = diag(F) and edf2 = diag(2F-FF) ## Main issue is that lbb and lbb + St must be +ve definite for ## F to make sense. ## NOTE: what comes in is in stabilizing parameterization from ## gam.fit5, and may have had parameters dropped. ## possibly initial reparam needs to be undone here as well ## before formation of F.... lbb <- -object$lbb ## Hessian of log likelihood in fit parameterization p <- ncol(lbb) ipiv <- piv <- attr(object$L,"pivot") ipiv[piv] <- 1:p ## Vb0 <- crossprod(forwardsolve(t(object$L),diag(object$D,nrow=p)[piv,])[ipiv,]) ## Bayes cov matrix with learning rate 1/gamma. Wrong - parameterization s.t. Vp*gamma is it #Vl <- if (gamma==1) NULL else chol2inv(chol(-object$lbb/gamma+object$St)) ## need to pre-condition lbb before testing rank... lbb <- object$D*t(object$D*lbb) R <- suppressWarnings(chol(lbb,pivot=TRUE)) if (attr(R,"rank") < ncol(R)) { ## The hessian of the -ve log likelihood is not +ve definite ## Find the "nearest" +ve semi-definite version and use that retry <- TRUE;tol <- 0 eh <- eigen(lbb,symmetric=TRUE) mev <- max(eh$values);dtol <- 1e-7 while (retry) { eh$values[eh$valuessum(edf1)) edf2 <- edf1 ## note hat not possible here... list(Vc=Vc,Vp=Vb,Ve=Ve,V.sp=V.sp,edf=edf,edf1=edf1,edf2=edf2,#F=F, R=R,db.drho=db.drho) } ## gam.fit5.post.proc deriv.check5 <- function(x, y, sp, weights = rep(1, length(y)), start = NULL, offset = rep(0, length(y)),Mp,family = gaussian(), control = gam.control(),deriv=2,eps=1e-7,spe=1e-3, Sl,gamma=1,nei=nei,...) 
## FD checking of derivatives for gam.fit5: a debugging routine { if (!deriv%in%c(1,2)) stop("deriv should be 1 or 2") if (control$epsilon>1e-9) control$epsilon <- 1e-9 ## first obtain the fit corresponding to sp... b <- gam.fit5(x=x,y=y,lsp=sp,Sl=Sl,weights=weights,offset=offset,deriv=deriv, family=family,control=control,Mp=Mp,start=start,gamma=gamma,nei=nei) ## now get the derivatives of the likelihood w.r.t. coefs... ll <- family$ll(y=y,X=x,coef=b$coefficients,wt=weights,family=family, deriv=1,d1b=0,d2b=0,Hp=NULL,rank=0,fh=NULL,D=NULL) ## and finite difference versions of these... p <- length(b$coefficients) fdg <- rep(0,p) fdh <- matrix(0,p,p) for (i in 1:p) { coef1 <- b$coefficients;coef1[i] <- coef1[i] + eps ll1 <- family$ll(y=y,X=x,coef=coef1,wt=weights,family=family, deriv=1,d1b=0,d2b=0,Hp=NULL,rank=0,fh=NULL,D=NULL) fdg[i] <- (ll1$l - ll$l)/eps fdh[,i] <- (ll1$lb - ll$lb)/eps } ## display them... oask <- devAskNewPage(TRUE) on.exit(devAskNewPage(oask)) plot(ll$lb,fdg,xlab="computed",ylab="FD",main="grad of log lik");abline(0,1) cat("log lik grad cor. =",cor(ll$lb,fdg),"\n") plot(ll$lbb,fdh,xlab="computed",ylab="FD",main="hess of log lik");abline(0,1) cat("log lik hess cor. =",cor(as.numeric(ll$lbb),as.numeric(fdh)),"\n") ## now we need to investigate the derivatives w.r.t. the log smoothing parameters. 
M <- length(sp) ## number of smoothing parameters fd.br <- matrix(0,p,M) REML1 <- rep(0,M) REML2 <- b$REML2 fd.dH <- list() if (!is.null(b$b2)) fd.br2 <- b$b2*0 k <- 0 for (i in 1:M) { ## the smoothing parameter loop sp0 <- sp1 <- sp;sp1[i] <- sp[i] + spe/2;sp0[i] <- sp[i] - spe/2 b0 <- gam.fit5(x=x,y=y,lsp=sp0,Sl=Sl,weights=weights,offset=offset,deriv=1, family=family,control=control,Mp=Mp,start=start,gamma=gamma,nei=nei) b1 <- gam.fit5(x=x,y=y,lsp=sp1,Sl=Sl,weights=weights,offset=offset,deriv=1, family=family,control=control,Mp=Mp,start=start,gamma=gamma,nei=nei) fd.br[,i] <- (b1$coefficients - b0$coefficients)/spe if (!is.null(b$b2)) { for (j in i:M) { k <- k + 1 fd.br2[,k] <- (b1$db.drho[,j]-b0$db.drho[,j])/spe } } REML1[i] <- (b1$REML-b0$REML)/spe REML2[,i] <- (b1$REML1-b0$REML1)/spe fd.dH[[i]] <- (b1$lbb - b0$lbb)/spe } ## plot db.drho against fd versions... for (i in 1:M) { plot(b$db.drho[,i],fd.br[,i],xlab="computed",ylab="FD",main="db/drho");abline(0,1) cat("cor db/drho[",i,"] = ",cor(b$db.drho[,i],fd.br[,i]),"\n") } ## second deriv of b if (!is.null(b$b2)) for (i in 1:ncol(b$b2)) { plot(b$b2[,i],fd.br2[,i],xlab="computed",ylab="FD",main="d2b/drhorho");abline(0,1) cat("cor d2b[",i,"] = ",cor(b$b2[,i],fd.br2[,i]),"\n") } ## plot first deriv Hessian against FD version for (i in 1:M) { plot(b$dH[[i]],fd.dH[[i]],xlab="computed",ylab="FD",main="dH/drho");abline(0,1) cat("cor dH/drho[",i,"] = ",cor(as.numeric(b$dH[[i]]),as.numeric(fd.dH[[i]])),"\n") } list(fd=list(lb=fdg,lbb=fdh,REML1=REML1,db.drho=fd.br,dH=fd.dH), lb=ll$lb,lbb=ll$lbb,REML1=b$REML1,db.drho=b$db.drho,dH=b$dH) } ## deriv.check5