# (c) Simon N. Wood (2013-2022). Provided under GPL 2.
## Routines for gam estimation beyond exponential family.

dDeta <- function(y,mu,wt,theta,fam,deriv=0) {
## What is available directly from the family are derivatives of the
## deviance and link w.r.t. mu. This routine converts these to the
## required derivatives of the deviance w.r.t. eta.
## deriv is the order of derivative of the smoothing parameter score
## required.
## This version is based on ratios of derivatives of links rather
## than raw derivatives of links. g2g = g''/g'^2, g3g = g'''/g'^3 etc
## Returns list d with Deta, Deta2, EDeta2 (expected), mixed eta/theta
## derivatives, the ratio terms Deta.Deta2 and Deta.EDeta2 (used for
## stable pseudodata construction), and d$good flagging finite rows.
  r <- fam$Dd(y, mu, theta, wt, level=deriv) ## family-supplied derivs w.r.t. mu
  d <- list(Deta=0,Dth=0,Dth2=0,Deta2=0,EDeta2=0,Detath=0,
            Deta3=0,Deta2th=0,Detath2=0,
            Deta4=0,Deta3th=0,Deta2th2=0)
  if (fam$link=="identity") { ## don't waste time on transformation
    d$Deta <- r$Dmu;d$Deta2 <- r$Dmu2
    d$EDeta2 <- r$EDmu2;d$Deta.Deta2 <- r$Dmu/r$Dmu2
    d$Deta.EDeta2 <- r$Dmu/r$EDmu2
    if (deriv>0) {
      d$Dth <- r$Dth; d$Detath <- r$Dmuth
      d$Deta3 <- r$Dmu3; d$Deta2th <- r$Dmu2th
      d$EDeta2th <- r$EDmu2th;d$EDeta3 <- r$EDmu3
    }
    if (deriv>1) {
      d$Deta4 <- r$Dmu4; d$Dth2 <- r$Dth2; d$Detath2 <- r$Dmuth2
      d$Deta2th2 <- r$Dmu2th2; d$Deta3th <- r$Dmu3th
    }
  } else { ## general link: chain rule in terms of link derivative ratios
    ig1 <- fam$mu.eta(fam$linkfun(mu)) ## dmu/deta
    ig12 <- ig1^2
    g2g <- fam$g2g(mu)
    d$Deta <- r$Dmu * ig1
    d$Deta2 <- r$Dmu2*ig12 - r$Dmu*g2g*ig1
    d$EDeta2 <- r$EDmu2*ig12
    d$Deta.Deta2 <- r$Dmu/(r$Dmu2*ig1 - r$Dmu*g2g)
    d$Deta.EDeta2 <- r$Dmu/(r$EDmu2*ig1)
    if (deriv>0) {
      ig13 <- ig12 * ig1
      d$Dth <- r$Dth
      d$Detath <- r$Dmuth * ig1
      g3g <- fam$g3g(mu)
      d$Deta3 <- r$Dmu3*ig13 - 3*r$Dmu2 * g2g * ig12 + r$Dmu * (3*g2g^2 - g3g)*ig1
      if (!is.null(r$EDmu3)) d$EDeta3 <- r$EDmu3*ig13 - 3*r$EDmu2 * g2g * ig12 ## EDmu=0
      d$Deta2th <- r$Dmu2th*ig12 - r$Dmuth*g2g*ig1
      if (!is.null(r$EDmu2th)) d$EDeta2th <- r$EDmu2th*ig12 ##- r$EDmuth*g2g*ig1
    }
    if (deriv>1) {
      g4g <- fam$g4g(mu)
      d$Deta4 <- ig12^2*r$Dmu4 - 6*r$Dmu3*ig13*g2g + r$Dmu2*(15*g2g^2-4*g3g)*ig12 -
                 r$Dmu*(15*g2g^3-10*g2g*g3g +g4g)*ig1
      d$Dth2 <- r$Dth2
      d$Detath2 <- r$Dmuth2 * ig1
      d$Deta2th2 <- ig12*r$Dmu2th2 - r$Dmuth2*g2g*ig1
      d$Deta3th <- ig13*r$Dmu3th - 3 *r$Dmu2th*g2g*ig12 + r$Dmuth*(3*g2g^2-g3g)*ig1
    }
  } ## end of non identity
  ## flag rows where all required derivatives are finite...
  good <- is.finite(d$Deta)&is.finite(d$Deta2)
  if (deriv>0) {
    nth <- length(theta)
    if (nth>1) good <- good&is.finite(rowSums(d$Dth))&is.finite(rowSums(d$Detath))&
                       is.finite(rowSums(d$Deta2th))&is.finite(d$Deta3) else
               good <- good & is.finite(d$Deta3) & if (nth==0) TRUE else
                       is.finite(d$Dth)&is.finite(d$Detath)&is.finite(d$Deta2th)
    if (deriv>1) {
      if (nth>1) good <- good&is.finite(rowSums(d$Dth2))&is.finite(rowSums(d$Detath2))&is.finite(rowSums(d$Deta2th2))&
                         is.finite(rowSums(d$Deta3th))&is.finite(d$Deta4) else
                 good <- good & is.finite(d$Deta4) & if (nth==0) TRUE else
                         is.finite(d$Dth2)&is.finite(d$Detath2)&is.finite(d$Deta2th2)&is.finite(d$Deta3th)
    }
  }
  d$good <- good
  d
} ## dDeta

fetad.test <- function(y,mu,wt,theta,fam,eps = 1e-7,plot=TRUE) {
## test family derivatives w.r.t. eta by finite differencing,
## printing range of differences and correlation for each derivative.
## NOTE(review): the 'plot' argument is accepted but plotting below is
## unconditional — confirm whether it should gate the plot() calls.
  dd <- dDeta(y,mu,wt,theta,fam,deriv=2)
  dev <- fam$dev.resids(y, mu, wt,theta)
  mu1 <- fam$linkinv(fam$linkfun(mu)+eps) ## mu perturbed by eps on eta scale
  dev1 <- fam$dev.resids(y,mu1, wt,theta)
  Deta.fd <- (dev1-dev)/eps
  cat("Deta: rdiff = ",range(dd$Deta-Deta.fd)," cor = ",cor(dd$Deta,Deta.fd),"\n")
  plot(dd$Deta,Deta.fd);abline(0,1)
  nt <- length(theta)
  if (fam$n.theta && nt>0) for (i in 1:nt) { ## first derivs w.r.t. each theta
    th1 <- theta;th1[i] <- th1[i] + eps
    dev1 <- fam$dev.resids(y, mu, wt,th1)
    Dth.fd <- (dev1-dev)/eps
    um <- if (nt>1) dd$Dth[,i] else dd$Dth
    cat("Dth[",i,"]: rdiff = ",range(um-Dth.fd)," cor = ",cor(um,Dth.fd),"\n")
    plot(um,Dth.fd);abline(0,1)
  }
  ## second order up...
  dd1 <- dDeta(y,mu1,wt,theta,fam,deriv=2)
  Deta2.fd <- (dd1$Deta - dd$Deta)/eps
  cat("Deta2: rdiff = ",range(dd$Deta2-Deta2.fd)," cor = ",cor(dd$Deta2,Deta2.fd),"\n")
  plot(dd$Deta2,Deta2.fd);abline(0,1)
  Deta3.fd <- (dd1$Deta2 - dd$Deta2)/eps
  cat("Deta3: rdiff = ",range(dd$Deta3-Deta3.fd)," cor = ",cor(dd$Deta3,Deta3.fd),"\n")
  plot(dd$Deta3,Deta3.fd);abline(0,1)
  Deta4.fd <- (dd1$Deta3 - dd$Deta3)/eps
  cat("Deta4: rdiff = ",range(dd$Deta4-Deta4.fd)," cor = ",cor(dd$Deta4,Deta4.fd),"\n")
  plot(dd$Deta4,Deta4.fd);abline(0,1)
  ## and now the higher derivs wrt theta...
  ind <- 1:nt
  if (fam$n.theta && nt>0) for (i in 1:nt) {
    th1 <- theta;th1[i] <- th1[i] + eps
    dd1 <- dDeta(y,mu,wt,th1,fam,deriv=2)
    Detath.fd <- (dd1$Deta - dd$Deta)/eps
    um <- if (nt>1) dd$Detath[,i] else dd$Detath
    cat("Detath[",i,"]: rdiff = ",range(um-Detath.fd)," cor = ",cor(um,Detath.fd),"\n")
    plot(um,Detath.fd);abline(0,1)
    Deta2th.fd <- (dd1$Deta2 - dd$Deta2)/eps
    um <- if (nt>1) dd$Deta2th[,i] else dd$Deta2th
    cat("Deta2th[",i,"]: rdiff = ",range(um-Deta2th.fd)," cor = ",cor(um,Deta2th.fd),"\n")
    plot(um,Deta2th.fd);abline(0,1)
    Deta3th.fd <- (dd1$Deta3 - dd$Deta3)/eps
    um <- if (nt>1) dd$Deta3th[,i] else dd$Deta3th
    cat("Deta3th[",i,"]: rdiff = ",range(um-Deta3th.fd)," cor = ",cor(um,Deta3th.fd),"\n")
    plot(um,Deta3th.fd);abline(0,1)
    ## now the 3 second derivative w.r.t. theta terms
    ## (ind indexes the packed upper-triangle columns for row i)
    Dth2.fd <- (dd1$Dth - dd$Dth)/eps
    um <- if (nt>1) dd$Dth2[,ind] else dd$Dth2
    er <- if (nt>1) Dth2.fd[,i:nt] else Dth2.fd
    cat("Dth2[",i,",]: rdiff = ",range(um-er)," cor = ",cor(as.numeric(um),as.numeric(er)),"\n")
    plot(um,er);abline(0,1)
    Detath2.fd <- (dd1$Detath - dd$Detath)/eps
    um <- if (nt>1) dd$Detath2[,ind] else dd$Detath2
    er <- if (nt>1) Detath2.fd[,i:nt] else Detath2.fd
    cat("Detath2[",i,",]: rdiff = ",range(um-er)," cor = ",cor(as.numeric(um),as.numeric(er)),"\n")
    ## cat("Detath2[",i,",]: rdiff = ",range(dd$Detath2-Detath2.fd)," cor = ",cor(dd$Detath2,Detath2.fd),"\n")
    plot(um,er);abline(0,1)
    Deta2th2.fd <- (dd1$Deta2th - dd$Deta2th)/eps
    um <- if (nt>1) dd$Deta2th2[,ind] else dd$Deta2th2
    er <- if (nt>1) Deta2th2.fd[,i:nt] else Deta2th2.fd
    cat("Deta2th2[",i,",]: rdiff = ",range(um-er)," cor = ",cor(as.numeric(um),as.numeric(er)),"\n")
    ## cat("Deta2th2[",i,",]: rdiff = ",range(dd$Deta2th2-Deta2th2.fd)," cor = ",cor(dd$Deta2th2,Deta2th2.fd),"\n")
    ind <- max(ind)+1:(nt-i)
    plot(um,er);abline(0,1)
  }
} ## fetad.test

corb <- function(x,z) {
## alternative to cor for measuring similarity of x and z,
## which is not scaling invariant. So 1 really means x and z
## are very close, not just linearly related.
  d <- x-z
  1-mean(d^2)/(sd(x)*sd(z))
}

fmud.test <- function(y,mu,wt,theta,fam,eps = 1e-7,plot=TRUE) {
## test family deviance derivatives w.r.t. mu by finite differencing.
## copy to make debugging easier...
  Dd <- fam$Dd;dev.resids <- fam$dev.resids
  dd <- Dd(y, mu, theta, wt, level=2)
  dev <- dev.resids(y, mu, wt,theta)
  dev1 <- dev.resids(y, mu+eps, wt,theta)
  Dmu.fd <- (dev1-dev)/eps
  cat("Dmu: rdiff = ",range(dd$Dmu-Dmu.fd)," cor = ",corb(dd$Dmu,Dmu.fd),"\n")
  if (plot) {
    pch <- 19;cex <- .4 ## only used inside later if (plot) blocks
    plot(dd$Dmu,Dmu.fd,pch=pch,cex=cex);abline(0,1,col=2)
    oask <- devAskNewPage(TRUE)
    on.exit(devAskNewPage(oask)) ## restore paging state on exit
  }
  nt <- length(theta)
  if (fam$n.theta>0&&nt>0) for (i in 1:nt) { ## first derivs w.r.t. each theta
    th1 <- theta;th1[i] <- th1[i] + eps
    dev1 <- dev.resids(y, mu, wt,th1)
    Dth.fd <- (dev1-dev)/eps
    um <- if (nt>1) dd$Dth[,i] else dd$Dth
    cat("Dth[",i,"]: rdiff = ",range(um-Dth.fd)," cor = ",corb(um,Dth.fd),"\n",sep="")
    if (plot) { plot(um,Dth.fd,pch=pch,cex=cex,xlab=paste("Dth[",i,"]",sep="")); abline(0,1,col=2)}
  }
  ## second order up...
  dd1 <- Dd(y, mu+eps, theta, wt, level=2)
  Dmu2.fd <- (dd1$Dmu - dd$Dmu)/eps
  cat("Dmu2: rdiff = ",range(dd$Dmu2-Dmu2.fd)," cor = ",corb(dd$Dmu2,Dmu2.fd),"\n")
  if (plot) { plot(dd$Dmu2,Dmu2.fd,pch=pch,cex=cex);abline(0,1,col=2)}
  Dmu3.fd <- (dd1$Dmu2 - dd$Dmu2)/eps
  cat("Dmu3: rdiff = ",range(dd$Dmu3-Dmu3.fd)," cor = ",corb(dd$Dmu3,Dmu3.fd),"\n")
  if (plot) { plot(dd$Dmu3,Dmu3.fd,pch=pch,cex=cex);abline(0,1,col=2)}
  Dmu4.fd <- (dd1$Dmu3 - dd$Dmu3)/eps
  cat("Dmu4: rdiff = ",range(dd$Dmu4-Dmu4.fd)," cor = ",corb(dd$Dmu4,Dmu4.fd),"\n")
  if (plot) { plot(dd$Dmu4,Dmu4.fd,pch=pch,cex=cex);abline(0,1,col=2)}
  ## and now the higher derivs wrt theta
  ind <- 1:nt
  if (fam$n.theta>0&&nt>0) for (i in 1:nt) {
    th1 <- theta;th1[i] <- th1[i] + eps
    dd1 <- Dd(y, mu, th1, wt, level=2)
    Dmuth.fd <- (dd1$Dmu - dd$Dmu)/eps
    um <- if (nt>1) dd$Dmuth[,i] else dd$Dmuth
    cat("Dmuth[",i,"]: rdiff = ",range(um-Dmuth.fd)," cor = ",corb(um,Dmuth.fd),"\n")
    if (plot) { plot(um,Dmuth.fd,pch=pch,cex=cex,xlab=paste("Dmuth[",i,"]",sep=""));abline(0,1,col=2)}
    Dmu2th.fd <- (dd1$Dmu2 - dd$Dmu2)/eps
    um <- if (nt>1) dd$Dmu2th[,i] else dd$Dmu2th
    cat("Dmu2th[",i,"]: rdiff = ",range(um-Dmu2th.fd)," cor = ",corb(um,Dmu2th.fd),"\n")
    if (plot) { plot(um,Dmu2th.fd,pch=pch,cex=cex,xlab=paste("Dmu2th[",i,"]",sep=""));abline(0,1,col=2)}
    if (!is.null(dd$EDmu2th)) { ## expected version, only if family supplies it
      EDmu2th.fd <- (dd1$EDmu2 - dd$EDmu2)/eps
      um <- if (nt>1) dd$EDmu2th[,i] else dd$EDmu2th
      cat("EDmu2th[",i,"]: rdiff = ",range(um-EDmu2th.fd)," cor = ",corb(um,EDmu2th.fd),"\n")
      if (plot) { plot(um,EDmu2th.fd,pch=pch,cex=cex,xlab=paste("EDmu2th[",i,"]",sep=""));abline(0,1,col=2)}
    }
    Dmu3th.fd <- (dd1$Dmu3 - dd$Dmu3)/eps
    um <- if (nt>1) dd$Dmu3th[,i] else dd$Dmu3th
    cat("Dmu3th[",i,"]: rdiff = ",range(um-Dmu3th.fd)," cor = ",corb(um,Dmu3th.fd),"\n")
    if (plot) { plot(um,Dmu3th.fd,pch=pch,cex=cex,xlab=paste("Dmu3th[",i,"]",sep=""));abline(0,1,col=2)}
    ## now the 3 second derivative w.r.t. theta terms...
    Dth2.fd <- (dd1$Dth - dd$Dth)/eps
    um <- if (nt>1) dd$Dth2[,ind] else dd$Dth2
    er <- if (nt>1) Dth2.fd[,i:nt] else Dth2.fd
    cat("Dth2[",i,",]: rdiff = ",range(um-er)," cor = ",corb(as.numeric(um),as.numeric(er)),"\n")
    if (plot) { plot(um,er,pch=pch,cex=cex,xlab=paste("Dth2[",i,",]",sep=""),ylab="fd");abline(0,1,col=2)}
    Dmuth2.fd <- (dd1$Dmuth - dd$Dmuth)/eps
    um <- if (nt>1) dd$Dmuth2[,ind] else dd$Dmuth2
    er <- if (nt>1) Dmuth2.fd[,i:nt] else Dmuth2.fd
    cat("Dmuth2[",i,",]: rdiff = ",range(um-er)," cor = ",corb(as.numeric(um),as.numeric(er)),"\n")
    if (plot) { plot(um,er,pch=pch,cex=cex,xlab=paste("Dmuth2[",i,",]",sep=""),ylab="fd");abline(0,1,col=2)}
    Dmu2th2.fd <- (dd1$Dmu2th - dd$Dmu2th)/eps
    um <- if (nt>1) dd$Dmu2th2[,ind] else dd$Dmu2th2
    er <- if (nt>1) Dmu2th2.fd[,i:nt] else Dmu2th2.fd
    cat("Dmu2th2[",i,",]: rdiff = ",range(um-er)," cor = ",corb(as.numeric(um),as.numeric(er)),"\n")
    if (plot) { plot(um,er,pch=pch,cex=cex,xlab=paste("Dmu2th2[",i,",]",sep=""),ylab="fd");abline(0,1,col=2)}
    ind <- max(ind)+1:(nt-i)
  }
} ## fmud.test

gam.fit4 <- function(x, y, sp, Eb,UrS=list(), weights = rep(1, nobs), start = NULL, etastart = NULL,
                     mustart = NULL, offset = rep(0, nobs),U1=diag(ncol(x)), Mp=-1, family = gaussian(),
                     control = gam.control(), deriv=2,gamma=1,
scale=1,scoreType="REML",null.coef=rep(0,ncol(x)),nei=NULL,...) {
## Routine for fitting GAMs beyond exponential family.
## Inputs as gam.fit3 except that family is of class "extended.family", while
## sp contains the vector of extended family parameters, followed by the log smoothing parameters,
## followed by the log scale parameter if scale < 0
  ## some families have second derivative of deviance, and hence iterative weights
  ## very close to zero for some data. This can lead to poorly scaled sqrt(w)z
  ## and it is better to base everything on wz...
  if (is.null(family$use.wz)) family$use.wz <- FALSE
  warn <- list()
  if (family$n.theta>0) { ## there are extra parameters to estimate
    ind <- 1:family$n.theta
    theta <- sp[ind] ## parameters of the family
    family$putTheta(theta)
    sp <- sp[-ind] ## log smoothing parameters
  } else theta <- family$getTheta() ## fixed value
  ## penalized <- if (length(UrS)>0) TRUE else FALSE
  if (scale>0) scale.known <- TRUE else {
    ## unknown scale parameter, trial value supplied as
    ## final element of sp.
    scale.known <- FALSE
    nsp <- length(sp)
    scale <- exp(sp[nsp])
    sp <- sp[-nsp]
  }
  x <- as.matrix(x)
  nSp <- length(sp)
  rank.tol <- .Machine$double.eps*100 ## tolerance to use for rank deficiency
  q <- ncol(x)
  n <- nobs <- nrow(x)
  xnames <- dimnames(x)[[2]]
  ynames <- if (is.matrix(y)) rownames(y) else names(y)
  ## Now a stable re-parameterization is needed....
  if (length(UrS)) {
    grderiv <- if (scoreType=="EFS") 1 else deriv
    rp <- gam.reparam(UrS,sp,grderiv)
    T <- diag(q)
    T[1:ncol(rp$Qs),1:ncol(rp$Qs)] <- rp$Qs
    T <- U1%*%T ## new params b'=T'b old params
    null.coef <- t(T)%*%null.coef
    if (!is.null(start)) start <- t(T)%*%start
    ## form x%*%T in parallel
    x <- .Call(C_mgcv_pmmult2,x,T,0,0,control$nthreads)
    rS <- list()
    for (i in 1:length(UrS)) {
      rS[[i]] <- rbind(rp$rS[[i]],matrix(0,Mp,ncol(rp$rS[[i]])))
    } ## square roots of penalty matrices in current parameterization
    Eb <- Eb%*%T ## balanced penalty matrix
    rows.E <- q-Mp
    Sr <- cbind(rp$E,matrix(0,nrow(rp$E),Mp))
    St <- rbind(cbind(rp$S,matrix(0,nrow(rp$S),Mp)),matrix(0,Mp,q))
  } else { ## unpenalized case
    grderiv <- 0
    T <- diag(q); St <- matrix(0,q,q)
    rSncol <- rows.E <- Eb <- Sr <- 0
    rS <- list(0)
    rp <- list(det=0,det1 = 0,det2 = 0,fixed.penalty=FALSE)
  }
  ## re-parameterization complete. Initialization....
  nvars <- ncol(x)
  ## NOTE(review): 'emtpy' below looks like a typo for 'empty' in the error
  ## message — runtime string left unchanged here; confirm before editing.
  if (nvars==0) stop("emtpy models not available")
  if (is.null(weights)) weights <- rep.int(1, nobs)
  if (is.null(offset)) offset <- rep.int(0, nobs)
  linkinv <- family$linkinv
  valideta <- family$valideta
  validmu <- family$validmu
  dev.resids <- family$dev.resids
  ## need an initial `null deviance' to test for initial divergence...
  ## if (!is.null(start)) null.coef <- start - can be on edge of feasible - not good
  null.eta <- as.numeric(x%*%null.coef + as.numeric(offset))
  ## call the families initialization code...
  if (is.null(mustart)) {
    eval(family$initialize)
    mukeep <- NULL
  } else {
    mukeep <- mustart
    eval(family$initialize)
    #mustart <- mukeep
  }
  old.pdev <- sum(dev.resids(y, linkinv(null.eta), weights,theta)) + t(null.coef)%*%St%*%null.coef
  if (!is.null(start)) { ## check it's at least better than null.coef
    pdev <- sum(dev.resids(y, linkinv(x%*%start+as.numeric(offset)), weights,theta)) + t(start)%*%St%*%start
    if (pdev>old.pdev) start <- mukeep <- etastart <- NULL
  }
  coefold <- null.coef ## set to default, may be replaced below
  etaold <- x %*% coefold + offset
  if (!is.null(mukeep)) mustart <- mukeep
  ## and now finalize initialization of mu and eta...
  eta <- if (!is.null(etastart)) etastart else
         if (!is.null(start))
           if (length(start) != nvars) stop("Length of start should equal ", nvars,
                " and correspond to initial coefs for ", deparse(xnames)) else {
             coefold <- start
             etaold <- offset + as.vector(if (NCOL(x) == 1) x * start else x %*% start)
           } else family$linkfun(mustart)
  mu <- linkinv(eta)
  conv <- boundary <- FALSE
  dd <- dDeta(y,mu,weights,theta,family,0) ## derivatives of deviance w.r.t. eta
  w <- dd$Deta2 * .5 ## Newton weights
  wz <- w*(eta-offset) - .5*dd$Deta ## weighted pseudodata
  z <- (eta-offset) - dd$Deta.Deta2 ## pseudodata
  good <- is.finite(z)&is.finite(w)
  zg <- rep(0,max(dim(x)))
  for (iter in 1:control$maxit) { ## start of main fitting iteration
    if (control$trace) cat(iter," ")
    if (control$trace&sum(!good)>0) cat("\n",sum(!good)," not good\n")
    if (sum(!good)) {
      use.wy <- TRUE
      good <- is.finite(w)&is.finite(wz)
      z[!is.finite(z)] <- 0 ## avoid NaN in .C call - unused anyway
    } else use.wy <- family$use.wz
    if (sum(good)==0) stop("no good data in iteration")
    ng <- sum(good)
    zg[1:ng] <- z[good] ## ensure that y dimension large enough for coefs
    oo <- .C(C_pls_fit1,
             y=as.double(zg),X=as.double(x[good,]),w=as.double(w[good]),wy = as.double(wz[good]),
             E=as.double(Sr),Es=as.double(Eb),n=as.integer(ng),
             q=as.integer(ncol(x)),rE=as.integer(rows.E),eta=as.double(z),
             penalty=as.double(1),rank.tol=as.double(rank.tol),
             nt=as.integer(control$nthreads),use.wy=as.integer(use.wy))
    posdef <- oo$n >= 0 ## C code signals indefiniteness via negative n
    if (!posdef) { ## then problem is indefinite - switch to +ve weights for this step
      if (control$trace) cat("**using positive weights\n")
      # problem is that Fisher can be very poor for zeroes
      ## index weights that are finite and positive
      good <- is.finite(dd$Deta2)
      good[good] <- dd$Deta2[good]>0
      w[!good] <- 0
      wz <- w*(eta-offset) - .5*dd$Deta
      z <- (eta-offset) - dd$Deta.Deta2
      good <- is.finite(z)&is.finite(w)
      if (sum(!good)) {
        use.wy <- TRUE
        good <- is.finite(w)&is.finite(wz)
        z[!is.finite(z)] <- 0 ## avoid NaN in .C call - unused anyway
      } else use.wy <- family$use.wz
      ng <- sum(good)
      zg[1:ng] <- z[good] ## ensure that y dimension large enough for coefs
      oo <- .C(C_pls_fit1, ##C_pls_fit1,
               y=as.double(zg),X=as.double(x[good,]),w=as.double(w[good]),wy = as.double(wz[good]),
               E=as.double(Sr),Es=as.double(Eb),n=as.integer(ng),
               q=as.integer(ncol(x)),rE=as.integer(rows.E),eta=as.double(z),
               penalty=as.double(1),rank.tol=as.double(rank.tol),
               nt=as.integer(control$nthreads),use.wy=as.integer(use.wy))
    }
    start <- oo$y[1:ncol(x)] ## current coefficient estimates
    penalty <- oo$penalty ## size of penalty
    eta <- drop(x%*%start) ## the linear predictor (less offset)
    if (any(!is.finite(start))) { ## test for breakdown
      conv <- FALSE
      warn[[length(warn)+1]] <- paste(" gam.fit4 non-finite coefficients at iteration ", iter)
      return(list(REML=NA,warn=warn)) ## return immediately signalling failure
    }
    mu <- linkinv(eta <- eta + offset)
    dev <- sum(dev.resids(y, mu, weights,theta))
    ## now step halve under non-finite deviance...
    if (!is.finite(dev)) {
      if (is.null(coefold)) {
        if (is.null(null.coef)) stop("no valid set of coefficients has been found:please supply starting values", call. = FALSE)
        ## Try to find feasible coefficients from the null.coef and null.eta
        coefold <- null.coef
        etaold <- null.eta
      }
      ii <- 1
      while (!is.finite(dev)) {
        if (ii > control$maxit) stop("inner loop 1; can't correct step size")
        ii <- ii + 1
        start <- (start + coefold)/2
        eta <- (eta + etaold)/2
        mu <- linkinv(eta)
        dev <- sum(dev.resids(y, mu, weights,theta))
      }
      boundary <- TRUE
      penalty <- t(start)%*%St%*%start ## reset penalty too
      if (control$trace) cat("Step halved: new deviance =", dev, "\n")
    } ## end of infinite deviance correction
    ## now step halve if mu or eta are out of bounds...
    if (!(valideta(eta) && validmu(mu))) {
      ii <- 1
      while (!(valideta(eta) && validmu(mu))) {
        if (ii > control$maxit) stop("inner loop 2; can't correct step size")
        ii <- ii + 1
        start <- (start + coefold)/2
        eta <- (eta + etaold)/2
        mu <- linkinv(eta)
      }
      boundary <- TRUE
      dev <- sum(dev.resids(y, mu, weights,theta))
      penalty <- t(start)%*%St%*%start ## need to reset penalty too
      if (control$trace) cat("Step halved: new deviance =", dev, "\n")
    } ## end of invalid mu/eta handling
    ## now check for divergence of penalized deviance....
    pdev <- dev + penalty ## the penalized deviance
    if (control$trace) cat("penalized deviance =", pdev, "\n")
    div.thresh <- 10*(.1+abs(old.pdev))*.Machine$double.eps^.5
    if (pdev-old.pdev>div.thresh) { ## solution diverging
      ii <- 1 ## step halving counter
      if (iter==1) { ## immediate divergence, need to shrink towards zero
        etaold <- null.eta; coefold <- null.coef
      }
      while (pdev -old.pdev > div.thresh) { ## step halve until pdev <= old.pdev
        if (ii > 100) stop("inner loop 3; can't correct step size")
        ii <- ii + 1
        start <- (start + coefold)/2
        eta <- (eta + etaold)/2
        mu <- linkinv(eta)
        dev <- sum(dev.resids(y, mu, weights,theta))
        penalty <- t(start)%*%St%*%start
        pdev <- dev + penalty ## the penalized deviance
        if (control$trace) cat("Step halved: new penalized deviance =", pdev, "\n")
      }
    } ## end of pdev divergence
    if (scoreType=="EFS"&&family$n.theta>0) { ## there are theta parameters to estimate...
      scale1 <- if (!is.null(family$scale)) family$scale else scale
      if (family$n.theta>0||scale<0) theta <- estimate.theta(theta,family,y,mu,scale=scale1,wt=weights,tol=1e-7)
      if (!is.null(family$scale) && family$scale<0) {
        scale <- exp(theta[family$n.theta+1])
        theta <- theta[1:family$n.theta]
      }
      family$putTheta(theta)
    }
    ## get new weights and pseudodata (needed now for grad testing)...
    dd <- dDeta(y,mu,weights,theta,family,0) ## derivatives of deviance w.r.t. eta
    w <- dd$Deta2 * .5; wz <- w*(eta-offset) - .5*dd$Deta
    z <- (eta-offset) - dd$Deta.Deta2
    good <- is.finite(z)&is.finite(w)
    ## convergence testing...
    if (posdef && abs(pdev - old.pdev)/(0.1 + abs(pdev)) < control$epsilon) {
      ## Need to check coefs converged adequately, to ensure implicit differentiation
      ## ok. Testing coefs unchanged is problematic under rank deficiency (not guaranteed to
      ## drop same parameter every iteration!)
      grad <- 2 * t(x[good,,drop=FALSE])%*%((w[good]*(x%*%start)[good]-wz[good]))+ 2*St%*%start
      if (max(abs(grad)) > control$epsilon*(abs(pdev)+scale)) { ## not converged quite enough
        old.pdev <- pdev
        coef <- coefold <- start
        etaold <- eta
        ##muold <- mu
      } else { ## converged
        conv <- TRUE
        coef <- start
        break
      }
    } else { ## not converged
      old.pdev <- pdev
      coef <- coefold <- start
      etaold <- eta
    }
    if (scoreType=="EFS"&&family$n.theta>0) {
      ## now recompute pdev with new theta, otherwise step control won't work at next iteration
      dev <- sum(dev.resids(y, mu, weights,theta))
      old.pdev <- pdev <- dev + penalty
    }
  } ## end of main loop
  ## so at this stage the model has been fully estimated
  coef <- as.numeric(T %*% coef) ## back to original parameterization
  ## now obtain derivatives, if these are needed...
  check.derivs <- FALSE
  while (check.derivs) { ## debugging code to check derivatives
    eps <- 1e-7
    fmud.test(y,mu,weights,theta,family,eps = eps)
    fetad.test(y,mu,weights,theta,family,eps = eps)
  }
  dd <- dDeta(y,mu,weights,theta,family,deriv)
  w <- dd$Deta2 * .5
  z <- (eta-offset) - dd$Deta.Deta2 ## - .5 * dd$Deta[good] / w
  wf <- pmax(0,dd$EDeta2 * .5) ## Fisher type weights
  wz <- w*(eta-offset) - 0.5*dd$Deta ## Wz finite when w==0
  good <- is.finite(wz)&is.finite(w)&dd$good
  if (sum(good)==0) stop("not enough finite derivatives")
  residuals <- z - (eta - offset)
  residuals[!is.finite(residuals)] <- NA
  z[!is.finite(z)] <- 0 ## avoid passing NA etc to C code
  ntot <- length(theta) + length(sp)
  rSncol <- unlist(lapply(UrS,ncol))
  ## Now drop any elements of dd that have been dropped in fitting...
  if (sum(!good)>0) { ## drop !good from fields of dd, weights and pseudodata
    z <- z[good]; w <- w[good]; wz <- wz[good]; wf <- wf[good]
    dd$Deta <- dd$Deta[good];dd$Deta2 <- dd$Deta2[good]
    dd$EDeta2 <- dd$EDeta2[good]
    if (deriv>0) dd$Deta3 <- dd$Deta3[good]
    if (deriv>1) dd$Deta4 <- dd$Deta4[good]
    if (length(theta)>1) { ## matrix valued theta derivatives
      if (deriv>0) {
        dd$Dth <- dd$Dth[good,]; dd$Detath <- dd$Detath[good,]; dd$Deta2th <- dd$Deta2th[good,]
        if (deriv>1) {
          dd$Detath2 <- dd$Detath2[good,]; dd$Deta3th <- dd$Deta3th[good,]
          dd$Deta2th2 <- dd$Deta2th2[good,];dd$Dth2 <- dd$Dth2[good,]
        }
      }
    } else { ## vector valued theta derivatives
      if (deriv>0) {
        dd$Dth <- dd$Dth[good]; dd$Detath <- dd$Detath[good]; dd$Deta2th <- dd$Deta2th[good]
        if (deriv>1) {
          dd$Detath2 <- dd$Detath2[good]; dd$Deta3th <- dd$Deta3th[good]
          dd$Deta2th2 <- dd$Deta2th2[good]; dd$Dth2 <- dd$Dth2[good]
        }
      }
    }
  }
  ## gdi.type should probably be computed after dropping via good
  gdi.type <- if (any(abs(w)<.Machine$double.xmin*1e20)||any(!is.finite(z))) 1 else 0
  if (scoreType=="EFS") scoreType <- "REML"
  oo <- .C(C_gdi2,
           X=as.double(x[good,]),E=as.double(Sr),Es=as.double(Eb),rS=as.double(unlist(rS)),
           U1 = as.double(U1),sp=as.double(exp(sp)),theta=as.double(theta),
           z=as.double(z),w=as.double(w),wz=as.double(wz),wf=as.double(wf),Dth=as.double(dd$Dth),
           Det=as.double(dd$Deta),
           Det2=as.double(dd$Deta2),Dth2=as.double(dd$Dth2),Det.th=as.double(dd$Detath),
           Det2.th=as.double(dd$Deta2th),Det3=as.double(dd$Deta3),Det.th2 = as.double(dd$Detath2),
           Det4 = as.double(dd$Deta4),Det3.th=as.double(dd$Deta3th), Deta2.th2=as.double(dd$Deta2th2),
           beta=as.double(coef),b1=as.double(rep(0,ntot*ncol(x))),w1=as.double(rep(0,ntot*length(z))),
           D1=as.double(rep(0,ntot)),D2=as.double(rep(0,ntot^2)),
           P=as.double(0),P1=as.double(rep(0,ntot)),P2 = as.double(rep(0,ntot^2)),
           ldet=as.double(1-2*(scoreType=="ML")),ldet1 = as.double(rep(0,ntot)),
           ldet2 = as.double(rep(0,ntot^2)),
           rV=as.double(rep(0,ncol(x)^2)),
           rank.tol=as.double(.Machine$double.eps^.75),rank.est=as.integer(0),
           n=as.integer(sum(good)),q=as.integer(ncol(x)),M=as.integer(nSp),
           n.theta=as.integer(length(theta)), Mp=as.integer(Mp),Enrow=as.integer(rows.E),
           rSncol=as.integer(rSncol),deriv=as.integer(deriv),
           fixed.penalty = as.integer(rp$fixed.penalty),nt=as.integer(control$nthreads),
           type=as.integer(gdi.type),dVkk=as.double(rep(0,nSp^2)))
  rV <- matrix(oo$rV,ncol(x),ncol(x)) ## rV%*%t(rV)*scale gives covariance matrix
  #rV <- rV # transform before return
  ## derivatives of coefs w.r.t. sps etc...
  ## note that db.drho and dw.drho start with derivs wrt theta, then wrt sp (no scale param of course)
  db.drho <- if (deriv) matrix(oo$b1,ncol(x),ntot) else NULL ## transform before return
  dw.drho <- if (deriv) matrix(oo$w1,length(z),ntot) else NULL
  Kmat <- matrix(0,nrow(x),ncol(x))
  Kmat[good,] <- oo$X ## rV%*%t(K)%*%(sqrt(wf)*X) = F; diag(F) is edf array
  Vg <- NCV <- NCV1 <- REML <- REML1 <- REML2 <- NULL
  if (scoreType=="NCV") { ## neighbourhood cross validation score
    eta.cv <- rep(0.0,length(nei$i))
    deta.cv <- if (deriv) matrix(0.0,length(nei$i),ntot) else matrix(0.0,1,ntot)
    w1 <- -dd$Deta/2; w2 <- dd$Deta2/2; dth <- dd$Detath/2 ## !?
    R <- try(chol(crossprod(x,w*x)+St),silent=TRUE)
    if (nei$jackknife > 2) { ## return NCV coef changes for each fold
      if (deriv>0) stop("jackknife and derivatives requested together")
      dth <- matrix(0,ncol(x),length(nei$m))
      deriv1 <- -1 ## signal to return coef changes
    } else deriv1 <- deriv
    if (inherits(R,"try-error")) { ## use CG approach...
      Hi <- tcrossprod(rV) ## inverse of penalized Expected Hessian - inverse actual Hessian probably better
      cg.iter <- .Call(C_ncv,x,Hi,w1,w2,db.drho,dw.drho,rS,nei$i-1,nei$mi,nei$m,nei$k-1,oo$beta,exp(sp),eta.cv,
                       deta.cv, dth, deriv1);
      warn[[length(warn)+1]] <- "NCV positive definite update check not possible"
    } else { ## use Cholesky update approach
      pdef.fails <- .Call(C_Rncv,x,R,w1,w2,db.drho,dw.drho,rS,nei$i-1,nei$mi,nei$m,nei$k-1,oo$beta,exp(sp),eta.cv,
                          deta.cv, dth, deriv1,.Machine$double.eps,control$ncv.threads);
      if (pdef.fails) warn[[length(warn)+1]] <- "some NCV updates not positive definite"
    }
    mu.cv <- linkinv(eta.cv)
    nt <- family$n.theta
    ## if some elements of theta are fixed, figure out which derivatives will be needed...
    if (deriv) keep <- if (length(theta)>nt) (length(theta)+1):(ncol(db.drho)+!scale.known) else
                       1:(ncol(db.drho)+!scale.known)
    dev0 <- dev.resids(y[nei$i], mu[nei$i], weights[nei$i],theta)
    ls0 <- family$ls(y[nei$i],weights[nei$i],theta,scale)
    if (family$qapprox) { ## quadratic approximation to NCV
      qdev <- dev0 + gamma*dd$Deta[nei$i]*(eta.cv-eta[nei$i]) + 0.5*gamma*dd$Deta2[nei$i]*(eta.cv-eta[nei$i])^2
      NCV <- sum(qdev)/(2*scale) - ls0$ls
      if (deriv) {
        deta <- x %*% db.drho
        ncv1 <- (dd$Deta[nei$i]*((1-gamma)*deta[nei$i,,drop=FALSE]+gamma*deta.cv) +
                 gamma*dd$Deta2[nei$i]*deta.cv*(eta.cv-eta[nei$i]) +
                 0.5*gamma*as.numeric(dd$Deta3[nei$i])*deta[nei$i,,drop=FALSE]*(eta.cv-eta[nei$i])^2)/(2*scale)
        if (nt>0) { ## deal with direct dependence on the theta parameters
          ncv1[,1:nt] <- ncv1[,1:nt]- ls0$LSTH1[,1:nt] +
            if (nt==1) (dd$Dth[nei$i] + gamma*dd$Detath[nei$i]*(eta.cv-eta[nei$i]) +
                        0.5*gamma*dd$Deta2th[nei$i]*(eta.cv-eta[nei$i])^2)/(2*scale) else
                       (dd$Dth[nei$i,] + gamma*dd$Detath[nei$i,]*(eta.cv-eta[nei$i]) +
                        0.5*gamma*dd$Deta2th[nei$i,]*(eta.cv-eta[nei$i])^2)/(2*scale)
        }
        if (!scale.known) {
          ncv1 <- cbind(ncv1,-qdev/(2*scale) - ls0$LSTH1[,1+nt])
        }
      }
    } else { ## exact NCV
      dev.cv <- dev.resids(y, mu.cv, weights,theta)
      NCV <- sum(dev.cv)/(2*scale) - ls0$ls
      DEV <- sum(dev0)/(2*scale) - ls0$ls
      if (gamma!=1) NCV <- gamma*NCV - (gamma-1)*DEV
      if (deriv) {
        dd.cv <- dDeta(y[nei$i],mu.cv,weights[nei$i],theta,family,1)
        ncv1 <- dd.cv$Deta*deta.cv/(2*scale)
        if (gamma!=1) dev1 <- (dd$Deta*(x%*%db.drho))[nei$i,,drop=FALSE]/(2*scale)
        if (nt>0) {
          ncv1[,1:nt] <- ncv1[,1:nt] + as.matrix(dd.cv$Dth/(2*scale)) - ls0$LSTH1[,1:nt]
          if (gamma!=1) dev1[,1:nt] <- dev1[,1:nt] + as.matrix(dd$Dth/(2*scale))[nei$i,,drop=FALSE] - ls0$LSTH1[,1:nt]
        }
        if (!scale.known) { ## deal with log scale parameter derivative
          ncv1 <- cbind(ncv1,-dev.cv/(2*scale) - ls0$LSTH1[,1+nt])
          ## NOTE(review): 'lSTH1' below looks like a typo for 'LSTH1'
          ## (cf. the line above) — confirm against family$ls return fields.
          if (gamma!=1) dev1 <- cbind(dev1,-dev0/(2*scale) - ls0$lSTH1[,1+nt])
        }
        if (gamma!=1) ncv1 <- gamma*ncv1 - (gamma-1)*dev1
      }
    } ## exact NCV
    if (nei$jackknife>2) {
      nk <- c(nei$m[1],diff(nei$m)) ## dropped fold sizes
      jkw <- sqrt((nobs-nk)/(nobs*nk)) ## jackknife weights
      dth <- jkw*t(dth)%*%t(T)
      #Vj <- crossprod(dd) ## jackknife cov matrix for coefs (beta)
      #attr(Vj,"dd") <- dd
      #attr(NCV,"Vj") <- Vj
      attr(NCV,"dd") <- dth
    }
    attr(NCV,"eta.cv") <- eta.cv
    if (deriv) {
      attr(NCV,"deta.cv") <- deta.cv;
      NCV1 <- colSums(ncv1)
      NCV1 <- NCV1[keep] ## drop derivatives for any fixed theta parameters
      Vg <- crossprod(ncv1[,keep,drop=FALSE]) ## empirical cov matrix of grad
    }
  } else { ## RE/ML
    D2 <- matrix(oo$D2,ntot,ntot); ldet2 <- matrix(oo$ldet2,ntot,ntot)
    bSb2 <- matrix(oo$P2,ntot,ntot)
    ## compute the REML score...
    ls <- family$ls(y,weights,theta,scale)
    nt <- length(theta)
    lsth1 <- if (nt>0) ls$lsth1[1:nt] else rep(0,0)
    lsth2 <- if (nt>0) as.matrix(ls$lsth2)[1:nt,1:nt] else matrix(0,0,0) ## exclude any derivs w.r.t log scale here
    REML <- ((dev+oo$P)/(2*scale) - ls$ls)/gamma + (oo$ldet - rp$det)/2 -
            as.numeric(scoreType=="REML") * Mp * (log(2*pi*scale)/2-log(gamma)/2)
    if (deriv) {
      det1 <- oo$ldet1
      if (nSp) {
        ind <- 1:nSp + length(theta)
        det1[ind] <- det1[ind] - rp$det1
      }
      REML1 <- ((oo$D1+oo$P1)/(2*scale) - c(lsth1,rep(0,length(sp))))/gamma + (det1)/2
      if (deriv>1) {
        ls2 <- D2*0;if (nt>0) ls2[1:nt,1:nt] <- lsth2
        if (nSp) ldet2[ind,ind] <- ldet2[ind,ind] - rp$det2
        REML2 <- ((D2+bSb2)/(2*scale) - ls2)/gamma + ldet2/2
      }
    }
    if (!scale.known&&deriv) { ## need derivatives wrt log scale, too
      Dp <- dev + oo$P
      dlr.dlphi <- (-Dp/(2 *scale) - ls$lsth1[nt+1])/gamma - as.numeric(scoreType=="REML") * Mp/2
      d2lr.d2lphi <- (Dp/(2*scale) - ls$lsth2[nt+1,nt+1])/gamma
      d2lr.dspphi <- -(oo$D1+oo$P1)/(2*scale*gamma)
      if (nt>0) d2lr.dspphi[1:nt] <- d2lr.dspphi[1:nt] - ls$lsth2[nt+1,1:nt]/gamma
      REML1 <- c(REML1,dlr.dlphi)
      if (deriv==2) {
        REML2 <- rbind(REML2,as.numeric(d2lr.dspphi))
        REML2 <- cbind(REML2,c(as.numeric(d2lr.dspphi),d2lr.d2lphi))
      }
    }
  }
  nth <- length(theta)
  if (deriv) db.drho <- T %*% db.drho
  if (deriv>0&&family$n.theta==0&&nth>0) { ## need to drop derivs for fixed theta
    REML1 <- REML1[-(1:nth)]
    if (deriv>1) REML2 <- REML2[-(1:nth),-(1:nth)]
    db.drho <- db.drho[,-(1:nth),drop=FALSE]
  }
  names(coef) <- xnames
  names(residuals) <- ynames
  wtdmu <- sum(weights * y)/sum(weights) ## has to then be corrected when this is incorrect
  ## wtdmu <- sum(weights * mu)/sum(weights) ## changed from y
  nulldev <- sum(dev.resids(y, rep(wtdmu,length(y)), weights)) ## this will be corrected in family postproc
  n.ok <- nobs - sum(weights == 0)
  nulldf <- n.ok
  ww <- wt <- rep.int(0, nobs)
  wt[good] <- wf
  ww[good] <- w
  if (deriv && nrow(dw.drho)!=nrow(x)) { ## expand dw.drho back to full n rows
    w1 <- dw.drho
    dw.drho <- matrix(0,nrow(x),ncol(w1))
    dw.drho[good,] <- w1
  }
  ## dev is used in family$aic to estimate the scale as dev/sum(weights), or is unused.
  ## the form is for compatibility with exponential families, but the estimator can
  ## be biased for low count data in the extended family case. So better to force the
  ## actual scale estimate to be used...
  aic.model <- family$aic(y, mu, theta, weights, dev=scale*sum(weights)) # note: incomplete 2*edf needs to be added
  list(coefficients = coef,residuals=residuals,fitted.values = mu,
       family=family, linear.predictors = eta,deviance=dev,
       null.deviance=nulldev,iter=iter, weights=wt, ## note that these are Fisher type weights
       prior.weights=weights, working.weights = ww, ## working weights
       df.null = nulldf, y = y, converged = conv,z=z,
       boundary = boundary,
       REML=REML,REML1=REML1,REML2=REML2,NCV=NCV,NCV1=NCV1,
       rV=T %*% rV,db.drho=db.drho,dw.drho=dw.drho,
       scale.est=scale,reml.scale=scale, aic=aic.model, rank=oo$rank.est,
       K=Kmat,control=control,Vg=Vg,
       dVkk = matrix(oo$dVkk,nSp,nSp),ldetS1 = if (grderiv) rp$det1 else 0
       #,D1=oo$D1,D2=D2,
       #ldet=oo$ldet,ldet1=oo$ldet1,ldet2=ldet2,
       #bSb=oo$P,bSb1=oo$P1,bSb2=bSb2,
       #ls=ls$ls,ls1=ls$lsth1,ls2=ls$lsth2
      )
} ## gam.fit4

efsudr <- function(x,y,lsp,Eb,UrS,weights,family,offset=0,start=NULL,etastart=NULL,mustart=NULL,
                   U1=diag(ncol(x)), intercept = TRUE,scale=1,Mp=-1,control=gam.control(),n.true=-1,...) {
## Extended Fellner-Schall method for regular and extended families,
## with PIRLS performed by gam.fit3/4.
## tr(S^-S_j) is returned by ldetS1, rV %*% t(rV)*scale is
## cov matrix. I think b'S_jb will need to be computed here.
nsp <- length(UrS) if (inherits(family,"extended.family")) { spind <- family$n.theta + 1:nsp thind <- if (family$n.theta>0) 1:family$n.theta else rep(0,0) } else { thind <- rep(0,0) spind <- 1:nsp ## index of smoothing params in lsp } estimate.scale <- (length(lsp)>max(spind)) lsp[spind] <- lsp[spind] + 2.5 mult <- 1 fit <- gam.fit3(x=x, y=y, sp=lsp, Eb=Eb,UrS=UrS, weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family, control = control, intercept = intercept,deriv=0, gamma=1,scale=scale,scoreType="EFS", n.true=n.true,...) if (length(thind)>0) lsp[thind] <- family$getTheta() if (estimate.scale) lsp[length(lsp)] <- log(fit$scale) ## Also need scale estimate. OK from gam.fit3, but gam.fit4 version probably needs correcting ## for edf, as obtained via MLE. p <- ncol(x) n <- nrow(x) score.hist <- rep(0,200) bSb <- trVS <- rep(0,nsp) for (iter in 1:200) { start <- fit$coefficients Y <- U1[,1:(ncol(U1)-Mp)] ## penalty range space ## project coefs and rV to Y, since this is space of UrS[[i]] Yb <- drop(t(Y)%*%start) rV <- t(fit$rV) ## so t(rV)%*%rV*scale is cov matrix rVY <- rV %*% Y ## ith penalty is UrS[[i]]%*%t(UrS[[i]])... for (i in 1:length(UrS)) { xx <- Yb %*% UrS[[i]] bSb[i] <- sum(xx^2) xx <- rVY %*% UrS[[i]] trVS[i] <- sum(xx^2) } edf <- p - sum(trVS*exp(lsp[spind])) if (inherits(family,"extended.family")&&estimate.scale) { fit$scale <- fit$scale*n/(n-edf) ## correct for edf. 
} a <- pmax(0,fit$ldetS1*exp(-lsp[spind]) - trVS) ## NOTE: double check scaling here phi <- if (estimate.scale) fit$scale else scale r <- a/pmax(0,bSb)*phi r[a==0&bSb==0] <- 1 r[!is.finite(r)] <- 1e6 lsp1 <- lsp lsp1[spind] <- pmin(lsp[spind] + log(r)*mult,control$efs.lspmax) max.step <- max(abs(lsp1-lsp)) old.reml <- fit$REML fit <- gam.fit3(x=x, y=y, sp=lsp1, Eb=Eb,UrS=UrS, weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family, control = control, intercept = intercept,deriv=0,mustart=mustart, gamma=1,scale=scale,scoreType="EFS", n.true=n.true,...) if (length(thind)>0) lsp1[thind] <- family$getTheta() if (estimate.scale) lsp1[length(lsp)] <- log(fit$scale) ## some step length control... if (fit$REML<=old.reml) { ## improvement if (max.step<.05) { ## consider step extension (near optimum) lsp2 <- lsp lsp2[spind] <- pmin(lsp[spind] + log(r)*mult*2,control$efs.lspmax) ## try extending step... fit2 <- gam.fit3(x=x, y=y, sp=lsp2, Eb=Eb,UrS=UrS, weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family, control = control, intercept = intercept,deriv=0,mustart=mustart, gamma=1,scale=scale,scoreType="EFS", n.true=n.true,...) if (length(thind)>0) lsp2[thind] <- family$getTheta() if (estimate.scale) lsp2[length(lsp)] <- log(fit$scale) if (fit2$REML < fit$REML) { ## improvement - accept extension fit <- fit2;lsp <- lsp2 mult <- mult * 2 } else { ## accept old step lsp <- lsp1 } } else lsp <- lsp1 } else { ## no improvement while (fit$REML > old.reml&&mult>1) { ## don't contract below 1 as update doesn't have to improve REML mult <- mult/2 ## contract step lsp1 <- lsp lsp1[spind] <- pmin(lsp[spind] + log(r)*mult,control$efs.lspmax) fit <- gam.fit3(x=x, y=y, sp=lsp1, Eb=Eb,UrS=UrS, weights = weights, start = start, offset = offset,U1=U1, Mp=Mp, family = family, control = control, intercept = intercept,deriv=0,mustart=mustart, gamma=1,scale=scale,scoreType="EFS", n.true=n.true,...) 
if (length(thind)>0) lsp1[thind] <- family$getTheta() if (estimate.scale) lsp1[length(lsp)] <- log(fit$scale) } lsp <- lsp1 if (mult<1) mult <- 1 } score.hist[iter] <- fit$REML ## break if EFS step small and REML change negligible over last 3 steps. if (iter>3 && max.step<.05 && max(abs(diff(score.hist[(iter-3):iter])))0) TRUE else FALSE warn <- list() nSp <- length(lsp) q <- ncol(x) nobs <- length(y) if (penalized) { Eb <- attr(Sl,"E") ## balanced penalty sqrt ## the stability reparameterization + log|S|_+ and derivs... rp <- ldetS(Sl,rho=lsp,fixed=rep(FALSE,length(lsp)),np=q,root=TRUE) x <- Sl.repara(rp$rp,x) ## apply re-parameterization to x Eb <- Sl.repara(rp$rp,Eb) ## root balanced penalty St <- crossprod(rp$E) ## total penalty matrix E <- rp$E ## root total penalty attr(E,"use.unscaled") <- TRUE ## signal initialization code that E not to be further scaled if (!is.null(start)) start <- Sl.repara(rp$rp,start) ## re-para start ## NOTE: it can be that other attributes need re-parameterization here ## this should be done in 'family$initialize' - see mvn for an example. } else { ## unpenalized so no derivatives required deriv <- 0 rp <- list(ldetS=0,rp=list()) St <- matrix(0,q,q) E <- matrix(0,0,q) ## can be needed by initialization code } ## now call initialization code, but make sure that any ## supplied 'start' vector is not overwritten... start0 <- start ## Assumption here is that the initialization code is fine with ## re-parameterized x... eval(family$initialize) if (!is.null(start0)) start <- start0 coef <- as.numeric(start) if (is.null(weights)) weights <- rep.int(1, nobs) if (is.null(offset)) offset <- rep.int(0, nobs) ## get log likelihood, grad and Hessian (w.r.t. coefs - not s.p.s) ... 
llf <- family$ll ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1) ll0 <- ll$l - drop(t(coef)%*%St%*%coef)/2 grad <- ll$lb - St%*%coef iconv <- max(abs(grad))0) stop("non finite values in Hessian") if (min(D)<=0) { ## 2/2/19 replaces any D<0 indicating indef Dthresh <- max(D)*sqrt(.Machine$double.eps) if (-min(D) < Dthresh) { ## could be indef or +ve semi def indefinite <- FALSE D[D0]),max(ev)*1e-6)*mult mult <- mult*10 ev[ev0) { ## limit step length to .1 of coef length s.norm <- sqrt(sum(step^2)) c.norm <- sqrt(c.norm) if (s.norm > .1*c.norm) step <- step*0.1*c.norm/s.norm } s.norm <- sqrt(sum(step^2)) ## store for scaling steepest if needed ## try the Newton step... coef1 <- coef + step ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=1) ll1 <- ll$l - drop(t(coef1)%*%St%*%coef1)/2 khalf <- 0;fac <- 2 ## with ll1 < ll0 in place of ll1 <= ll0 in next line than we can repeatedly accept ## miniscule steps that do not actually improve anything. llold <- ll ## avoid losing lbb slot on stp failure no.change <- 0 while ((!is.finite(ll1)||ll1 <= ll0) && khalf < 25) { ## step halve until it succeeds... step <- step/fac;coef1 <- coef + step ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=0) ll1 <- ll$l - (t(coef1)%*%St%*%coef1)/2 if (is.finite(ll1)&&ll1>=ll0) { ## improvement, or at least no worse. ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=1) } if (ll1 == ll0) no.change <- no.change + 1 ## abort if step has made no difference (e.g. 2 iteractions of exactly no improvement)... if (max(abs(coef-coef1))1) khalf <- 100 khalf <- khalf + 1 if (khalf>5) fac <- 5 } ## end step halve ## Following tries steepest ascent if Newton failed. Only do this if not initially converged. 
## Otherwise it is possible for SA to mess up when routine is called ## with converged parameter values - takes a tiny step that leads to a machine noise ## improvement, but pushes a gradient just over threshold, Newton then continues to fail ## and we get stuck with SA convergence rates. ## One reason to try SA (if grad not already zero) is that Newton step can become ## poor due to numerical conditioning problems, whereas SA is largely unaffected. if (!is.finite(ll1) || (ll1<=ll0 && !iconv)) { ## switch to steepest ascent #step <- 0.5*drop(grad)*mean(abs(coef))/mean(abs(grad)) step <- drop(grad)*s.norm/sqrt(sum(grad^2)) ## scaled to initial Newton step length khalf <- 0 } no.change <- 0 while ((!is.finite(ll1)||(ll1 <= ll0 && !iconv)) && khalf < 25) { ## step cut until it succeeds... step <- step/10;coef1 <- coef + step ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=0) ll1 <- ll$l - (t(coef1)%*%St%*%coef1)/2 if (is.finite(ll1)&&ll1>=ll0) { ## improvement, or no worse ll <- llf(y,x,coef1,weights,family,offset=offset,deriv=1) } if (ll1 == ll0) no.change <- no.change + 1 ## abort if step has made no difference... if (max(abs(coef-coef1))1) khalf <- 100 ## step gone nowhere khalf <- khalf + 1 } if ((is.finite(ll1)&&ll1 >= ll0&&(khalf<25||indefinite))||iter==control$maxit) { ## step ok. Accept and test coef <- coef + step grad <- ll$lb - St%*%coef Hp <- -ll$lbb+St ## convergence test... 
ok <- (iter==control$maxit || max(abs(grad)) < control$epsilon*abs(ll0)) if (ok) { ## appears to have converged if (indefinite) { ## not a well defined maximum if (perturbed==5) stop("indefinite penalized likelihood in gam.fit5 ") if (iter<4||rank.checked) { perturbed <- perturbed + 1 coef <- coef*(1+(rep_len(c(0,1),length(coef))*.02-.01)*perturbed) + (rep_len(c(0,1),length(coef)) - 0.5 ) * mean(abs(coef))*1e-5*perturbed ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1) ll0 <- ll$l - (t(coef)%*%St%*%coef)/2 grad <- ll$lb - St%*%coef Hp <- -ll$lbb+St } else { rank.checked <- TRUE if (penalized) { Sb <- crossprod(Eb) ## balanced penalty Hb <- -ll$lbb/norm(ll$lbb,"F")+Sb/norm(Sb,"F") ## balanced penalized hessian } else Hb <- -ll$lbb/norm(ll$lbb,"F") ## apply pre-conditioning, otherwise badly scaled problems can result in ## wrong coefs being dropped... D <- abs(diag(Hb)) D[D<1e-50] <- 1;D <- D^-.5 Hb <- t(D*Hb)*D qrh <- qr(Hb,LAPACK=TRUE) rank <- Rrank(qr.R(qrh)) if (rank < q) { ## rank deficient. need to drop and continue to adjust other params drop <- sort(qrh$pivot[(rank+1):q]) ## set these params to zero bdrop <- 1:q %in% drop ## TRUE FALSE version ## now drop the parameters and recompute ll0... 
lpi <- attr(x,"lpi") xat <- attributes(x) xat$dim <- xat$dimnames <- NULL coef <- coef[-drop] St <- St[-drop,-drop] x <- x[,-drop] ## dropping columns from model matrix if (!is.null(lpi)) { ## need to adjust column indexes as well ii <- (1:q)[!bdrop];ij <- rep(NA,q) ij[ii] <- 1:length(ii) ## col i of old model matrix is col ij[i] of new for (i in 1:length(lpi)) { lpi[[i]] <- ij[lpi[[i]][!(lpi[[i]]%in%drop)]] # drop and shuffle up } } ## lpi adjustment done if (length(xat)>0) for (i in 1:length(xat)) attr(x,names(xat)[i]) <- xat[[i]] attr(x,"lpi") <- lpi attr(x,"drop") <- drop ## useful if family has precomputed something from x ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1) ll0 <- ll$l - (t(coef)%*%St%*%coef)/2 grad <- ll$lb - St%*%coef Hp <- -ll$lbb+St } } } else { ## not indefinite really converged converged <- TRUE # ... don't break until L and D made to match final Hp } } else ll0 <- ll1 ## step ok but not converged yet } else { ## step failed. ll <- llold ## restore old ll with lbb slot if (is.null(drop)) bdrop <- rep(FALSE,q) if (iconv && iter==1) { ## OK to fail on first step if apparently converged to start with converged <- TRUE ## Note: important to check if improvement possible even if apparently coef <- start ## converged, otherwise sp changes can lead to no sp objective change! } else { converged <- FALSE ## NOTE: the threshold can be unrealistic if gradient or step can't be computed to this accuracy, e.g. ## because of very large smoothing parameters - could estimate grad/step accuracy from machine zero ## perturbation of grad/step calc, but perhaps too fussy. coefp <- coef*(1+rep_len(c(-1,1),length(coef))*.Machine$double.eps^.9) llp <- llf(y,x,coef,weights,family,offset=offset,deriv=1) gradp <- llp$lb - St%*%coefp err <- min(1e-3,kappaH*max(1,mean(abs(gradp-grad))/mean(abs(coefp-coef)))*.Machine$double.eps) ## err is an estimate of the acheivable relative error, capped above at 1e-3 to ensure ## this level of stability loss gets reported! 
if (max(abs(grad/drop(ll0)))>max(err,control$epsilon*2)) warn[[length(warn)+1]] <- paste("gam.fit5 step failed: max magnitude relative grad =",max(abs(grad/drop(ll0)))) } break ## no need to recompute L and D, so break now } } ## end of main fitting iteration ## at this stage the Hessian (of pen lik. w.r.t. coefs) should be +ve semi definite, ## so that the pivoted Choleski factor should exist... if (iter == 2*control$maxit&&converged==FALSE) warn[[length(warn)+1]] <- gettextf("gam.fit5 iteration limit reached: max abs grad = %g",max(abs(grad))) ldetHp <- 2*sum(log(diag(L))) - 2 * sum(log(D)) ## log |Hp| if (!is.null(drop)) { ## create full version of coef with zeros for unidentifiable fcoef <- rep(0,length(bdrop));fcoef[!bdrop] <- coef } else fcoef <- coef dVkk <- d1l <- d2l <- d1bSb <- d2bSb <- d1b <- d2b <- d1ldetH <- d2ldetH <- d1b <- d2b <- NULL ncv <- scoreType=="NCV" if (deriv>0) { ## Implicit differentiation for derivs... m <- nSp d1b <- matrix(0,rank,m) Sib <- Sl.termMult(rp$Sl,fcoef,full=TRUE) ## list of penalties times coefs if (nSp) for (i in 1:m) d1b[,i] <- -D*(backsolve(L,forwardsolve(t(L),(D*Sib[[i]][!bdrop])[piv]))[ipiv]) ## obtain the curvature check matrix... dVkk <- crossprod(L[,ipiv]%*%(d1b/D)) if (!is.null(drop)) { ## create full version of d1b with zeros for unidentifiable fd1b <- matrix(0,q,m) fd1b[!bdrop,] <- d1b } else fd1b <- d1b ## Now call the family again to get first derivative of Hessian w.r.t ## smoothing parameters, in list d1H. Alternatively, if deriv==1 and !ncv ## then tr(Hi d1H) returne as a vector in 'd1H'. ll <- if (ncv) llf(y,x,coef,weights,family,offset=offset,deriv=3,d1b=d1b,ncv=TRUE) else llf(y,x,coef,weights,family,offset=offset,deriv=2+(deriv>1),d1b=d1b,fh=D*t(D*chol2inv(L)[ipiv,ipiv])) # d1l <- colSums(ll$lb*d1b) # cancels if (deriv>1) { ## Implicit differentiation for the second derivatives is now possible... 
d2b <- matrix(0,rank,m*(m+1)/2) k <- 0 for (i in 1:m) for (j in i:m) { k <- k + 1 v <- -ll$d1H[[i]]%*%d1b[,j] + Sl.mult(rp$Sl,fd1b[,j],i)[!bdrop] + Sl.mult(rp$Sl,fd1b[,i],j)[!bdrop] d2b[,k] <- -D*(backsolve(L,forwardsolve(t(L),(D*v)[piv]))[ipiv]) if (i==j) d2b[,k] <- d2b[,k] + d1b[,i] } ## Now call family for last time to get trHid2H the tr(H^{-1} d^2 H / drho_i drho_j)... llr <- llf(y,x,coef,weights,family,offset=offset,deriv=4,d1b=d1b,d2b=d2b, Hp=Hp,rank=rank,fh = L,D=D) ## Now compute Hessian of log lik w.r.t. log sps using chain rule # d2la <- colSums(ll$lb*d2b) # cancels # k <- 0 d2l <- matrix(0,m,m) for (i in 1:m) for (j in i:m) { # k <- k + 1 d2l[j,i] <- d2l[i,j] <- # d2la[k] + # cancels t(d1b[,i])%*%ll$lbb%*%d1b[,j] } } ## if (deriv > 1) } ## if (deriv > 0) Vg <- NULL ## jackknife grad matrix in NCV case if (scoreType=="NCV") { REML <- REML1 <- REML2 <- NULL if (deriv==0) ll <- llf(y,x,coef,weights,family,offset=offset,deriv=1,d1b=d1b,ncv=TRUE) ## otherwise l1, l2 not returned ncv <- family$ncv ## helps debugging! deriv1 <- if (nei$jackknife>2) -1 else if (deriv==0) 0 else 1 ## create nei if null - now in estimate.gam #if (is.null(nei)||is.null(nei$k)||is.null(nei$m)) nei <- list(i=1:nobs,mi=1:nobs,m=1:nobs,k=1:nobs) ## LOOCV #if (is.null(nei$i)) if (length(nei$m)==nobs) nei$mi <- nei$i <- 1:nobs else stop("unclear which points NCV neighbourhoods belong to") #if (length(nei$mi)!=length(nei$m)) stop("for NCV number of dropped and predicted neighbourhoods must match") ## complete dH if (deriv>0) { for (i in 1:length(ll$d1H)) ll$d1H[[i]] <- ll$d1H[[i]] - Sl.mult(rp$Sl,diag(q),i)[!bdrop,!bdrop] } R1 <- try(chol(t(Hp/D)/D),silent=TRUE) ll$gamma <- gamma; ## note: use of quadratic approx to NCV signalled by family$qapprox #if (overlap||inherits(R1,"try-error")) if (inherits(R1,"try-error")) { ## get H (Hp?) 
and Hi Hi <- t(D*chol2inv(L)[ipiv,ipiv])*D ret <- ncv(x,y,weights,nei,coef,family,ll,H=t(Hp/D)/D,Hi=Hi,offset=offset,dH=ll$d1H, db=d1b,deriv=deriv1) warn[[length(warn)+1]] <- "NCV update positive definite check not possible" } else { ## cholesky version ret <- ncv(x,y,weights,nei,coef,family,ll,R=R1,offset=offset,dH=ll$d1H,db=d1b, deriv=deriv1,nt=control$ncv.threads) if (ret$error>0) warn[[length(warn)+1]] <- "some NCV updates not positive definite" } NCV <- ret$NCV NCV1 <- ret$NCV1 Vg <- ret$Vg if (deriv1<0) { ## Jackknife cov matrix... nk <- c(nei$m[1],diff(nei$m)) ## dropped fold sizes jkw <- sqrt((nobs-nk)/(nobs*nk)) ## jackknife weights dd <- attr(ret$NCV,"deta.cv") #dd <-jkw*t(dd)%*%t(T) dd <- Sl.repa(rp$rp,t(dd),l=-1) ## undo repara dd <- Sl.inirep(Sl,dd,l=1) ## undo initial repara #Vj <- tcrossprod(dd) ## jackknife cov matrix #attr(NCV,"Vj") <- Vj attr(NCV,"dd") <- jkw*t(dd) } } else { ## REML required NCV <- NCV1 <- NULL ## Compute the derivatives of log|H+S|... if (deriv > 0) { ## only first derivative wrt rho required... if (deriv==1&&!is.list(ll$d1H)) { d1ldetH <- -ll$d1H for (i in 1:m) { A <- Sl.mult(rp$Sl,diag(q),i,full=TRUE)[!bdrop,!bdrop] bind <- rowSums(abs(A))!=0 ## row/cols of non-zero block A <- A[,bind,drop=FALSE] ## drop the zero columns A <- D*(backsolve(L,forwardsolve(t(L),(D*A)[piv,]))[ipiv,,drop=FALSE]) d1ldetH[i] <- d1ldetH[i] + sum(diag(A[bind,,drop=FALSE])) } } else { ## deriv>1 || is.list(ll$d1H) ## computation of d1ldetH using full d1H derivative of Hessian w.r.t. sp matrices... d1ldetH <- rep(0,m) d1Hp <- list() for (i in 1:m) { A <- -ll$d1H[[i]] + Sl.mult(rp$Sl,diag(q),i)[!bdrop,!bdrop] d1Hp[[i]] <- D*(backsolve(L,forwardsolve(t(L),(D*A)[piv,]))[ipiv,,drop=FALSE]) d1ldetH[i] <- sum(diag(d1Hp[[i]])) } } } ## if (deriv >0) if (deriv > 1) { ## second derivatives... 
d2ldetH <- matrix(0,m,m) k <- 0 for (i in 1:m) for (j in i:m) { k <- k + 1 d2ldetH[i,j] <- -sum(d1Hp[[i]]*t(d1Hp[[j]])) - llr$trHid2H[k] if (i==j) { ## need to add term relating to smoothing penalty A <- Sl.mult(rp$Sl,diag(q),i,full=TRUE)[!bdrop,!bdrop] bind <- rowSums(abs(A))!=0 ## row/cols of non-zero block A <- A[,bind,drop=FALSE] ## drop the zero columns A <- D*(backsolve(L,forwardsolve(t(L),(D*A)[piv,]))[ipiv,,drop=FALSE]) d2ldetH[i,j] <- d2ldetH[i,j] + sum(diag(A[bind,,drop=FALSE])) } else d2ldetH[j,i] <- d2ldetH[i,j] } } ## if (deriv > 1) ## Compute derivs of b'Sb... if (deriv>0) { Skb <- Sl.termMult(rp$Sl,fcoef,full=TRUE) d1bSb <- rep(0,m) for (i in 1:m) { Skb[[i]] <- Skb[[i]][!bdrop] d1bSb[i] <- sum(coef*Skb[[i]]) } } if (deriv>1) { d2bSb <- matrix(0,m,m) for (i in 1:m) { Sd1b <- St%*%d1b[,i] for (j in i:m) { d2bSb[j,i] <- d2bSb[i,j] <- 2*sum( d1b[,i]*Skb[[j]] + d1b[,j]*Skb[[i]] + d1b[,j]*Sd1b) } d2bSb[i,i] <- d2bSb[i,i] + sum(coef*Skb[[i]]) } } ## get grad and Hessian of REML score... REML <- -as.numeric((ll$l - drop(t(coef)%*%St%*%coef)/2)/gamma + rp$ldetS/2 - ldetHp/2 + Mp*(log(2*pi)/2)-log(gamma)/2) REML1 <- if (deriv<1) NULL else -as.numeric( # d1l # cancels - d1bSb/(2*gamma) + rp$ldet1/2 - d1ldetH/2 ) REML2 <- if (deriv<2) NULL else -( (d2l - d2bSb/2)/gamma + rp$ldet2/2 - d2ldetH/2 ) } ## REML computations if (control$trace) { cat("\niter =",iter," ll =",ll$l," REML =",REML," bSb =",t(coef)%*%St%*%coef/2,"\n") cat("log|S| =",rp$ldetS," log|H+S| =",ldetHp," n.drop =",length(drop),"\n") if (!is.null(REML1)) cat("REML1 =",REML1,"\n") } ## Get possibly multiple linear predictors lpi <- attr(x,"lpi") if (is.null(lpi)) { ## only one... linear.predictors <- if (is.null(offset)) as.numeric(x%*%coef) else as.numeric(x%*%coef+offset) fitted.values <- family$linkinv(linear.predictors) } else { ## multiple... 
fitted.values <- linear.predictors <- matrix(0,nrow(x),length(lpi)) if (!is.null(offset)) offset[[length(lpi)+1]] <- 0 for (j in 1:length(lpi)) { linear.predictors[,j] <- as.numeric(x[,lpi[[j]],drop=FALSE] %*% coef[lpi[[j]]]) if (!is.null(offset[[j]])) linear.predictors[,j] <- linear.predictors[,j] + offset[[j]] fitted.values[,j] <- family$linfo[[j]]$linkinv( linear.predictors[,j]) } } coef <- Sl.repara(rp$rp,fcoef,inverse=TRUE) ## undo re-parameterization of coef #coef <- Sl.repa(rp$rp,fcoef,l=-1) if (!is.null(drop)&&!is.null(d1b)) { ## create full version of d1b with zeros for unidentifiable db.drho <- matrix(0,length(bdrop),ncol(d1b));db.drho[!bdrop,] <- d1b } else db.drho <- d1b ## and undo re-para... #if (!is.null(d1b)) db.drho <- t(Sl.repara(rp$rp,t(db.drho),inverse=TRUE,both.sides=FALSE)) ## wrong if (!is.null(d1b)) db.drho <- Sl.repa(rp$rp,db.drho,l=-1) #if (!is.null(d2b)) d2b <- Sl.repa(rp$rp,d2b,l=-1) ## NOTE: DEBUG only ## Following needed for debugging H derivatives if Cholesky stabilization used #if (!is.null(ll$d1H)) for (i in 1:length(ll$d1H)) ll$d1H[[i]] <- Sl.repa(rp$rp,ll$d1H[[i]],l=2,r=1) ## debug #ll$lbb <- Sl.repa(rp$rp,ll$lbb,l=2,r=1) ## debug ret <- list(coefficients=coef,family=family,y=y,prior.weights=weights, fitted.values=fitted.values, linear.predictors=linear.predictors, scale.est=1, ### NOTE: needed by newton, but what is sensible here? REML= REML,REML1= REML1,REML2=REML2,NCV=NCV,NCV1=NCV1, rank=rank,aic = -2*ll$l, ## 2*edf needs to be added ##deviance = -2*ll$l, l= ll$l,## l1 =d1l,l2 =d2l, lbb = ll$lbb, ## Hessian of log likelihood L=L, ## chol factor of pre-conditioned penalized hessian bdrop=bdrop, ## logical index of dropped parameters D=D, ## diagonal preconditioning matrix St=St, ## total penalty matrix rp = rp$rp,Vg=Vg, db.drho = db.drho, ## derivative of penalty coefs w.r.t. log sps. 
#bSb = bSb, bSb1 = d1bSb,bSb2 = d2bSb, S1=rp$ldet1, #S=rp$ldetS,S1=rp$ldet1,S2=rp$ldet2, #Hp=ldetHp,Hp1=d1ldetH,Hp2=d2ldetH, #b2 = d2b, iter=iter,H = ll$lbb,dH = ll$d1H,dVkk=dVkk,warn=warn)#,d2H=llr$d2H) ## debugging code to allow components of 2nd deriv of hessian w.r.t. sp.s ## to be passed to deriv.check.... #if (!is.null(ll$ghost1)&&!is.null(ll$ghost2)) { # ret$ghost1 <- ll$ghost1; ret$ghost2 <- ret$ghost2 #} ret } ## end of gam.fit5 efsud <- function(x,y,lsp,Sl,weights=NULL,offset=NULL,family, control=gam.control(),Mp=-1,start=NULL) { ## Extended Fellner-Schall method for general families ## tr(S^-S_j) is returned by ldetS as ldet1 - S1 from gam.fit5 ## b'S_jb is computed as d1bSb in gam.fit5 ## tr(V S_j) will need to be computed using Sl.termMult ## Sl returned by ldetS and Vb computed as in gam.fit5.postproc. tol <- 1e-6 lsp <- lsp + 2.5 mult <- 1 fit <- gam.fit5(x=x,y=y,lsp=lsp,Sl=Sl,weights=weights,offset=offset,deriv=0,family=family, control=control,Mp=Mp,start=start,gamma=1) score.hist <- rep(0,200) tiny <- .Machine$double.eps^.5 ## used to bound above zero for (iter in 1:200) { start <- fit$coefficients ## obtain Vb... ipiv <- piv <- attr(fit$L,"pivot") p <- length(piv) ipiv[piv] <- 1:p Vb <- crossprod(forwardsolve(t(fit$L),diag(fit$D,nrow=p)[piv,,drop=FALSE])[ipiv,,drop=FALSE]) if (sum(fit$bdrop)) { ## some coefficients were dropped... 
q <- length(fit$bdrop) ibd <- !fit$bdrop Vtemp <- Vb; Vb <- matrix(0,q,q) Vb[ibd,ibd] <- Vtemp } Vb <- Sl.repara(fit$rp,Vb,inverse=TRUE) SVb <- Sl.termMult(Sl,Vb) ## this could be made more efficient trVS <- rep(0,length(SVb)) for (i in 1:length(SVb)) { ind <- attr(SVb[[i]],"ind") trVS[i] <- sum(diag(SVb[[i]][,ind])) } Sb <- Sl.termMult(Sl,start,full=TRUE) bSb <- rep(0,length(Sb)) for (i in 1:length(Sb)) { bSb[i] <- sum(start*Sb[[i]]) } a <- pmax(tiny,fit$S1*exp(-lsp) - trVS) r <- a/pmax(tiny,bSb) r[a==0&bSb==0] <- 1 r[!is.finite(r)] <- 1e6 lsp1 <- pmin(lsp + log(r)*mult,control$efs.lspmax) max.step <- max(abs(lsp1-lsp)) old.reml <- fit$REML fit <- gam.fit5(x=x,y=y,lsp=lsp1,Sl=Sl,weights=weights,offset=offset,deriv=0, family=family,control=control,Mp=Mp,start=start,gamma=1) ## some step length control... if (fit$REML<=old.reml) { ## improvement if (max.step<.05) { ## consider step extension lsp2 <- pmin(lsp + log(r)*mult*2,12) ## try extending step... fit2 <- gam.fit5(x=x,y=y,lsp=lsp2,Sl=Sl,weights=weights,offset=offset,deriv=0,family=family, control=control,Mp=Mp,start=start,gamma=1) if (fit2$REML < fit$REML) { ## improvement - accept extension fit <- fit2;lsp <- lsp2 mult <- mult * 2 } else { ## accept old step lsp <- lsp1 } } else lsp <- lsp1 } else { ## no improvement while (fit$REML > old.reml&&mult>1) { ## don't contract below 1 as update doesn't have to improve REML mult <- mult/2 ## contract step lsp1 <- pmin(lsp + log(r)*mult,control$efs.lspmax) fit <- gam.fit5(x=x,y=y,lsp=lsp1,Sl=Sl,weights=weights,offset=offset,deriv=0,family=family, control=control,Mp=Mp,start=start,gamma=1) } lsp <- lsp1 if (mult<1) mult <- 1 } score.hist[iter] <- fit$REML ## break if EFS step small and REML change negligible over last 3 steps. 
if (iter>3 && max.step<.05 && max(abs(diff(score.hist[(iter-3):iter])))0) for (i in 1:length(fit$warn)) warning(fit$warn[[i]]) fit } ## efsud gam.fit5.post.proc <- function(object,Sl,L,lsp0,S,off,gamma) { ## object is object returned by gam.fit5, Sl is penalty object, L maps working sp ## vector to full sp vector ## Computes: ## R - unpivoted Choleski of estimated expected hessian of ll ## Vb - the Bayesian cov matrix, ## Ve - "frequentist" alternative ## F - the EDF matrix ## edf = diag(F) and edf2 = diag(2F-FF) ## Main issue is that lbb and lbb + St must be +ve definite for ## F to make sense. ## NOTE: what comes in is in stabilizing parameterization from ## gam.fit5, and may have had parameters dropped. ## possibly initial reparam needs to be undone here as well ## before formation of F.... lbb <- -object$lbb ## Hessian of log likelihood in fit parameterization p <- ncol(lbb) ipiv <- piv <- attr(object$L,"pivot") ipiv[piv] <- 1:p ## Vb0 <- crossprod(forwardsolve(t(object$L),diag(object$D,nrow=p)[piv,])[ipiv,]) ## Bayes cov matrix with learning rate 1/gamma. Wrong - parameterization s.t. Vp*gamma is it #Vl <- if (gamma==1) NULL else chol2inv(chol(-object$lbb/gamma+object$St)) ## need to pre-condition lbb before testing rank... lbb <- object$D*t(object$D*lbb) R <- suppressWarnings(chol(lbb,pivot=TRUE)) if (attr(R,"rank") < ncol(R)) { ## The hessian of the -ve log likelihood is not +ve definite ## Find the "nearest" +ve semi-definite version and use that retry <- TRUE;tol <- 0 eh <- eigen(lbb,symmetric=TRUE) mev <- max(eh$values);dtol <- 1e-7 while (retry) { eh$values[eh$valuessum(edf1)) edf2 <- edf1 ## note hat not possible here... list(Vc=Vc,Vp=Vb,Ve=Ve,V.sp=V.sp,edf=edf,edf1=edf1,edf2=edf2,#F=F, R=R,db.drho=db.drho) } ## gam.fit5.post.proc deriv.check5 <- function(x, y, sp, weights = rep(1, length(y)), start = NULL, offset = rep(0, length(y)),Mp,family = gaussian(), control = gam.control(),deriv=2,eps=1e-7,spe=1e-3, Sl,gamma=1,nei=nei,...) 
## FD checking of derivatives for gam.fit5: a debugging routine { if (!deriv%in%c(1,2)) stop("deriv should be 1 or 2") if (control$epsilon>1e-9) control$epsilon <- 1e-9 ## first obtain the fit corresponding to sp... b <- gam.fit5(x=x,y=y,lsp=sp,Sl=Sl,weights=weights,offset=offset,deriv=deriv, family=family,control=control,Mp=Mp,start=start,gamma=gamma,nei=nei) ## now get the derivatives of the likelihood w.r.t. coefs... ll <- family$ll(y=y,X=x,coef=b$coefficients,wt=weights,family=family, deriv=1,d1b=0,d2b=0,Hp=NULL,rank=0,fh=NULL,D=NULL) ## and finite difference versions of these... p <- length(b$coefficients) fdg <- rep(0,p) fdh <- matrix(0,p,p) for (i in 1:p) { coef1 <- b$coefficients;coef1[i] <- coef1[i] + eps ll1 <- family$ll(y=y,X=x,coef=coef1,wt=weights,family=family, deriv=1,d1b=0,d2b=0,Hp=NULL,rank=0,fh=NULL,D=NULL) fdg[i] <- (ll1$l - ll$l)/eps fdh[,i] <- (ll1$lb - ll$lb)/eps } ## display them... oask <- devAskNewPage(TRUE) on.exit(devAskNewPage(oask)) plot(ll$lb,fdg,xlab="computed",ylab="FD",main="grad of log lik");abline(0,1) cat("log lik grad cor. =",cor(ll$lb,fdg),"\n") plot(ll$lbb,fdh,xlab="computed",ylab="FD",main="hess of log lik");abline(0,1) cat("log lik hess cor. =",cor(as.numeric(ll$lbb),as.numeric(fdh)),"\n") ## now we need to investigate the derivatives w.r.t. the log smoothing parameters. 
M <- length(sp) ## number of smoothing parameters fd.br <- matrix(0,p,M) REML1 <- rep(0,M) REML2 <- b$REML2 fd.dH <- list() if (!is.null(b$b2)) fd.br2 <- b$b2*0 k <- 0 for (i in 1:M) { ## the smoothing parameter loop sp0 <- sp1 <- sp;sp1[i] <- sp[i] + spe/2;sp0[i] <- sp[i] - spe/2 b0 <- gam.fit5(x=x,y=y,lsp=sp0,Sl=Sl,weights=weights,offset=offset,deriv=1, family=family,control=control,Mp=Mp,start=start,gamma=gamma,nei=nei) b1 <- gam.fit5(x=x,y=y,lsp=sp1,Sl=Sl,weights=weights,offset=offset,deriv=1, family=family,control=control,Mp=Mp,start=start,gamma=gamma,nei=nei) fd.br[,i] <- (b1$coefficients - b0$coefficients)/spe if (!is.null(b$b2)) { for (j in i:M) { k <- k + 1 fd.br2[,k] <- (b1$db.drho[,j]-b0$db.drho[,j])/spe } } REML1[i] <- (b1$REML-b0$REML)/spe REML2[,i] <- (b1$REML1-b0$REML1)/spe fd.dH[[i]] <- (b1$lbb - b0$lbb)/spe } ## plot db.drho against fd versions... for (i in 1:M) { plot(b$db.drho[,i],fd.br[,i],xlab="computed",ylab="FD",main="db/drho");abline(0,1) cat("cor db/drho[",i,"] = ",cor(b$db.drho[,i],fd.br[,i]),"\n") } ## second deriv of b if (!is.null(b$b2)) for (i in 1:ncol(b$b2)) { plot(b$b2[,i],fd.br2[,i],xlab="computed",ylab="FD",main="d2b/drhorho");abline(0,1) cat("cor d2b[",i,"] = ",cor(b$b2[,i],fd.br2[,i]),"\n") } ## plot first deriv Hessian against FD version for (i in 1:M) { plot(b$dH[[i]],fd.dH[[i]],xlab="computed",ylab="FD",main="dH/drho");abline(0,1) cat("cor dH/drho[",i,"] = ",cor(as.numeric(b$dH[[i]]),as.numeric(fd.dH[[i]])),"\n") } list(fd=list(lb=fdg,lbb=fdh,REML1=REML1,db.drho=fd.br,dH=fd.dH), lb=ll$lb,lbb=ll$lbb,REML1=b$REML1,db.drho=b$db.drho,dH=b$dH) } ## deriv.check5