## R routines for gam fitting with calculation of derivatives w.r.t. sp.s
## (c) Simon Wood 2004-2022

## These routines are for type 3 gam fitting. The basic idea is that a P-IRLS
## is run to convergence, and only then is a scheme for evaluating the
## derivatives via the implicit function theorem used.

gam.reparam <- function(rS,lsp,deriv)
## Finds an orthogonal reparameterization which avoids `dominant machine zero leakage' between
## components of the square root penalty.
## rS is the list of the square root penalties: last entry is root of fixed
## penalty, if fixed.penalty=TRUE (i.e. length(rS)>length(sp))
## lsp is the vector of log smoothing parameters.
## *Assumption* here is that rS[[i]] are in a null space of total penalty already;
## see e.g. totalPenaltySpace & mini.roots
## Outputs:
## S -- the total penalty matrix similarity transformed for stability
## rS -- the component square roots, transformed in the same way
##       - tcrossprod(rS[[i]]) = rS[[i]] %*% t(rS[[i]]) gives the matrix penalty component.
## Qs -- the orthogonal transformation matrix S = t(Qs)%*%S0%*%Qs, where S0 is the
##       untransformed total penalty implied by sp and rS on input
## E -- the square root of the transformed S (obtained in a stable way by pre-conditioning)
## det -- log |S|
## det1 -- dlog|S|/dlog(sp) if deriv >0
## det2 -- hessian of log|S| wrt log(sp) if deriv>1
{ q <- nrow(rS[[1]])
  rSncol <- unlist(lapply(rS,ncol))
  M <- length(lsp)
  if (length(rS)>M) fixed.penalty <- TRUE else fixed.penalty <- FALSE

  d.tol <- .Machine$double.eps^.3 ## group `similar sized' penalties, to save work

  r.tol <- .Machine$double.eps^.75
  ## r.tol is a bit delicate -- too large and penalty range space can be suppressed.

  oo <- .C(C_get_stableS,S=as.double(matrix(0,q,q)),Qs=as.double(matrix(0,q,q)),sp=as.double(exp(lsp)),
           rS=as.double(unlist(rS)), rSncol = as.integer(rSncol), q = as.integer(q),
           M = as.integer(M), deriv=as.integer(deriv), det = as.double(0),
           det1 = as.double(rep(0,M)),det2 = as.double(matrix(0,M,M)),
           d.tol = as.double(d.tol),
           r.tol = as.double(r.tol),
           fixed_penalty = as.integer(fixed.penalty))
  S <- matrix(oo$S,q,q)
  S <- (S+t(S))*.5
  p <- abs(diag(S))^.5  ## by Choleski, p can not be zero if S +ve def
  p[p==0] <- 1          ## but it's possible to make a mistake!!
  ##E <- t(t(chol(t(t(S/p)/p)))*p)
  St <- t(t(S/p)/p)
  St <- (St + t(St))*.5 ## force exact symmetry -- avoids very rare mroot fails
  E <- t(mroot(St,rank=q)*p) ## the square root S, with column separation
  Qs <- matrix(oo$Qs,q,q)    ## the reparameterization matrix t(Qs)%*%S%*%Qs -> S
  k0 <- 1
  for (i in 1:length(rS)) { ## unpack the rS in the new space
    crs <- ncol(rS[[i]]);
    k1 <- k0 + crs * q - 1
    rS[[i]] <- matrix(oo$rS[k0:k1],q,crs)
    k0 <- k1 + 1
  }
  ## now get determinant + derivatives, if required...
  if (deriv > 0) det1 <- oo$det1 else det1 <- NULL
  if (deriv > 1) det2 <- matrix(oo$det2,M,M) else det2 <- NULL
  list(S=S,E=E,Qs=Qs,rS=rS,det=oo$det,det1=det1,det2=det2,fixed.penalty = fixed.penalty)
} ## gam.reparam


gam.fit3 <- function (x, y, sp, Eb,UrS=list(),
    weights = rep(1, nobs), start = NULL, etastart = NULL,
    mustart = NULL, offset = rep(0, nobs),U1=diag(ncol(x)), Mp=-1, family = gaussian(),
    control = gam.control(), intercept = TRUE,deriv=2,
    gamma=1,scale=1,printWarn=TRUE,scoreType="REML",null.coef=rep(0,ncol(x)),
    pearson.extra=0,dev.extra=0,n.true=-1,Sl=NULL,nei=NULL,...)
{ ## Inputs:
  ## * x model matrix
  ## * y response
  ## * sp log smoothing parameters
  ## * Eb square root of nicely balanced total penalty matrix used for rank detection
  ## * UrS list of penalty square roots in range space of overall penalty. UrS[[i]]%*%t(UrS[[i]])
  ##   is penalty. See 'estimate.gam' for more.
  ## * weights prior weights (reciprocal variance scale)
  ## * start initial values for parameters. ignored if etastart or mustart present (although passed on).
  ## * etastart initial values for eta
  ## * mustart initial values for mu. discarded if etastart present.
  ## * control - control list.
  ## * intercept - indicates whether model has one.
  ## * deriv - order 0,1 or 2 derivatives are to be returned (lower is cheaper!)
  ## * gamma - multiplier for effective degrees of freedom in GCV/UBRE.
  ## * scale parameter. Negative signals to estimate.
  ## * printWarn print or suppress?
  ## * scoreType - type of smoothness selection to use.
  ## * null.coef - coefficients for a null model, in order to be able to check for immediate
  ##   divergence.
  ## * pearson.extra is an extra component to add to the pearson statistic in the P-REML/P-ML
  ##   case, only.
  ## * dev.extra is an extra component to add to the deviance in the REML and ML cases only.
  ## * n.true is to be used in place of the length(y) in ML/REML calculations,
  ##   and the scale.est only.
  ##
  ## Version with new reparameterization and truncation strategy. Allows iterative weights
  ## to be negative. Basically the workhorse routine for Wood (2011) JRSSB.
  ## A much modified version of glm.fit. Purpose is to estimate regression coefficients
  ## and compute a smoothness selection score along with its derivatives.
  ##
  if (control$trace) { t0 <- proc.time();tc <- 0}
  warn <- list()
  if (inherits(family,"extended.family")) { ## then actually gam.fit4/5 is needed
    if (inherits(family,"general.family")) {
      return(gam.fit5(x,y,sp,Sl=Sl,weights=weights,offset=offset,deriv=deriv,
             family=family,scoreType=scoreType,control=control,Mp=Mp,start=start,gamma=gamma,nei=nei))
    } else
      return(gam.fit4(x, y, sp, Eb,UrS=UrS,
             weights = weights, start = start, etastart = etastart,
             mustart = mustart, offset = offset,U1=U1, Mp=Mp, family = family,
             control = control, deriv=deriv,gamma=gamma,
             scale=scale,scoreType=scoreType,null.coef=null.coef,nei=nei,...))
  }

  if (family$link==family$canonical) fisher <- TRUE else fisher <- FALSE
  ## ... if canonical Newton = Fisher, but Fisher cheaper!
  if (scale>0) scale.known <- TRUE else scale.known <- FALSE
  if (!scale.known&&scoreType%in%c("REML","ML","EFS")) {
    ## the final element of sp is actually log(scale)
    nsp <- length(sp)
    scale <- exp(sp[nsp])
    sp <- sp[-nsp]
  }
  if (!deriv%in%c(0,1,2)) stop("unsupported order of differentiation requested of gam.fit3")

  x <- as.matrix(x)
  nSp <- length(sp)
  if (nSp==0) deriv.sp <- 0 else deriv.sp <- deriv

  rank.tol <- .Machine$double.eps*100 ## tolerance to use for rank deficiency

  xnames <- dimnames(x)[[2]]
  ynames <- if (is.matrix(y)) rownames(y) else names(y)

  q <- ncol(x)
  if (length(UrS)) { ## find a stable reparameterization...
    grderiv <- if (scoreType=="EFS") 1 else deriv*as.numeric(scoreType%in%c("REML","ML","P-REML","P-ML"))
    rp <- gam.reparam(UrS,sp,grderiv) ## note also detects fixed penalty if present

    ## Following is for debugging only...
#    deriv.check <- FALSE
#    if (deriv.check&&grderiv) {
#      eps <- 1e-4
#      fd.grad <- rp$det1
#      for (i in 1:length(sp)) {
#        spp <- sp; spp[i] <- spp[i] + eps/2
#        rp1 <- gam.reparam(UrS,spp,grderiv)
#        spp[i] <- spp[i] - eps
#        rp0 <- gam.reparam(UrS,spp,grderiv)
#        fd.grad[i] <- (rp1$det-rp0$det)/eps
#      }
#      print(fd.grad)
#      print(rp$det1)
#    }

    T <- diag(q)
    T[1:ncol(rp$Qs),1:ncol(rp$Qs)] <- rp$Qs
    T <- U1%*%T ## new params b'=T'b old params

    null.coef <- t(T)%*%null.coef
    if (!is.null(start)) start <- t(T)%*%start

    ## form x%*%T in parallel
    x <- .Call(C_mgcv_pmmult2,x,T,0,0,control$nthreads) ## x <- x%*%T ## model matrix O(nq^2)

    rS <- list()
    for (i in 1:length(UrS)) {
      rS[[i]] <- rbind(rp$rS[[i]],matrix(0,Mp,ncol(rp$rS[[i]])))
    } ## square roots of penalty matrices in current parameterization
    Eb <- Eb%*%T ## balanced penalty matrix
    rows.E <- q-Mp
    Sr <- cbind(rp$E,matrix(0,nrow(rp$E),Mp))
    St <- rbind(cbind(rp$S,matrix(0,nrow(rp$S),Mp)),matrix(0,Mp,q))
  } else {
    grderiv <- 0
    T <- diag(q); St <- matrix(0,q,q)
    rSncol <- sp <- rows.E <- Eb <- Sr <- 0
    rS <- list(0)
    rp <- list(det=0,det1 = rep(0,0),det2 = rep(0,0),fixed.penalty=FALSE)
  }
  iter <- 0;coef <- rep(0,ncol(x))

  if (scoreType=="EFS") {
    scoreType <- "REML" ## basically optimizing REML
    deriv <- 0 ## only derivatives of log|S|_+ required (see above)
  }

  conv <- FALSE
  n <- nobs <- NROW(y) ## n is just to keep codetools happy
  if (n.true <= 0) n.true <- nobs ## n.true is used in criteria in place of nobs
  nvars <- ncol(x)
  EMPTY <- nvars == 0
  if (is.null(weights)) weights <- rep.int(1, nobs)
  if (is.null(offset)) offset <- rep.int(0, nobs)
  variance <- family$variance
  dev.resids <- family$dev.resids
  aic <- family$aic
  linkinv <- family$linkinv
  mu.eta <- family$mu.eta
  if (!is.function(variance) || !is.function(linkinv))
      stop("illegal `family' argument")
  valideta <- family$valideta
  if (is.null(valideta)) valideta <- function(eta) TRUE
  validmu <- family$validmu
  if (is.null(validmu)) validmu <- function(mu) TRUE
  if (is.null(mustart)) {
      eval(family$initialize)
  } else {
      mukeep <- mustart
      eval(family$initialize)
      mustart <- mukeep
  }
  ## Added code
  if (family$family=="gaussian"&&family$link=="identity") strictly.additive <- TRUE else
      strictly.additive <- FALSE
  ## end of added code

  D1 <- D2 <- P <- P1 <- P2 <- trA <- trA1 <- trA2 <-
    GCV<- GCV1<- GCV2<- GACV<- GACV1<- GACV2<- UBRE <-
    UBRE1<- UBRE2<- REML<- REML1<- REML2 <- NCV <- NCV1 <- NULL

  if (EMPTY) {
      eta <- rep.int(0, nobs) + offset
      if (!valideta(eta)) stop("Invalid linear predictor values in empty model")
      mu <- linkinv(eta)
      if (!validmu(mu)) stop("Invalid fitted means in empty model")
      dev <- sum(dev.resids(y, mu, weights))
      w <- (weights * mu.eta(eta)^2)/variance(mu) ### BUG: incorrect for Newton
      residuals <- (y - mu)/mu.eta(eta)
      good <- rep(TRUE, length(residuals))
      boundary <- conv <- TRUE
      coef <- numeric(0)
      iter <- 0
      V <- variance(mu)
      alpha <- dev
      trA2 <- trA1 <- trA <- 0
      if (deriv) GCV2 <- GCV1<- UBRE2 <- UBRE1<-trA1 <- rep(0,nSp)
      GCV <- nobs*alpha/(nobs-gamma*trA)^2
      UBRE <- alpha/nobs - scale + 2*gamma/n*trA
      scale.est <- alpha / (nobs - trA)
  } ### end if (EMPTY)
  else {
      eta <- if (!is.null(etastart)) etastart
      else if (!is.null(start))
          if (length(start) != nvars)
              stop(gettextf("Length of start should equal %d and correspond to initial coefs for %s",
                   nvars, deparse(xnames)))
          else {
              coefold <- start
              offset + as.vector(if (NCOL(x) == 1) x * start else x %*% start)
          }
      else family$linkfun(mustart)

      mu <- linkinv(eta)
      boundary <- conv <- FALSE
      rV=matrix(0,ncol(x),ncol(x))

      ## need an initial `null deviance' to test for initial divergence...
      ## Note: can be better not to shrink towards start on
      ## immediate failure, in case start is on edge of feasible space...
      ## if (!is.null(start)) null.coef <- start
      coefold <- null.coef
      etaold <- null.eta <- as.numeric(x%*%null.coef + as.numeric(offset))
      old.pdev <- sum(dev.resids(y, linkinv(null.eta), weights)) + t(null.coef)%*%St%*%null.coef
      ## ... if the deviance exceeds this then there is an immediate problem

      ii <- 0
      while (!(validmu(mu) && valideta(eta))) { ## shrink towards null.coef if immediately invalid
        ii <- ii + 1
        if (ii>20) stop("Can't find valid starting values: please specify some")
        if (!is.null(start)) start <- start * .9 + coefold * .1
        eta <- .9 * eta + .1 * etaold
        mu <- linkinv(eta)
      }
      zg <- rep(0,max(dim(x)))
      for (iter in 1:control$maxit) { ## start of main fitting iteration
          good <- weights > 0
          var.val <- variance(mu)
          varmu <- var.val[good]
          if (any(is.na(varmu))) stop("NAs in V(mu)")
          if (any(varmu == 0)) stop("0s in V(mu)")
          mu.eta.val <- mu.eta(eta)
          if (any(is.na(mu.eta.val[good]))) stop("NAs in d(mu)/d(eta)")

          good <- (weights > 0) & (mu.eta.val != 0)

          if (all(!good)) {
              conv <- FALSE
              warn[[length(warn)+1]] <- gettextf("gam.fit3 no observations informative at iteration %d", iter)
              break
          }

          mevg<-mu.eta.val[good];mug<-mu[good];yg<-y[good]
          weg<-weights[good];var.mug<-var.val[good]

          if (fisher) { ## Conventional Fisher scoring
            z <- (eta - offset)[good] + (yg - mug)/mevg
            w <- (weg * mevg^2)/var.mug
          } else { ## full Newton
            c = yg - mug
            alpha <- 1 + c*(family$dvar(mug)/var.mug + family$d2link(mug)*mevg)
            alpha[alpha==0] <- .Machine$double.eps
            z <- (eta - offset)[good] + (yg-mug)/(mevg*alpha)
            ## ... offset subtracted as eta = X%*%beta + offset
            w <- weg*alpha*mevg^2/var.mug
          }

          ##if (sum(good)<ncol(x)) stop("Not enough informative observations.")
          ng <- sum(good);zg[1:ng] <- z ## ensure y dim large enough for beta in all cases
          if (control$trace) t1 <- proc.time()
          oo <- .C(C_pls_fit1,y=as.double(zg),X=as.double(x[good,]),w=as.double(w),wy=as.double(w*z),
                   E=as.double(Sr),Es=as.double(Eb),n=as.integer(ng),
                   q=as.integer(ncol(x)),rE=as.integer(rows.E),eta=as.double(z),
                   penalty=as.double(1),rank.tol=as.double(rank.tol),nt=as.integer(control$nthreads),
                   use.wy=as.integer(0))
          if (control$trace) tc <- tc + sum((proc.time()-t1)[c(1,4)])

          if (!fisher&&oo$n<0) { ## likelihood indefinite - switch to Fisher for this step
            z <- (eta - offset)[good] + (yg - mug)/mevg
            w <- (weg * mevg^2)/var.mug
            ng <- sum(good);zg[1:ng] <- z ## ensure y dim large enough for beta in all cases
            if (control$trace) t1 <- proc.time()
            oo <- .C(C_pls_fit1,y=as.double(zg),X=as.double(x[good,]),w=as.double(w),wy=as.double(w*z),
                     E=as.double(Sr),Es=as.double(Eb),n=as.integer(ng),
                     q=as.integer(ncol(x)),rE=as.integer(rows.E),eta=as.double(z),
                     penalty=as.double(1),rank.tol=as.double(rank.tol),nt=as.integer(control$nthreads),
                     use.wy=as.integer(0))
            if (control$trace) tc <- tc + sum((proc.time()-t1)[c(1,4)])
          }

          start <- oo$y[1:ncol(x)];
          penalty <- oo$penalty
          eta <- drop(x%*%start)

          if (any(!is.finite(start))) {
              conv <- FALSE
              warn[[length(warn)+1]] <- gettextf("gam.fit3 non-finite coefficients at iteration %d", iter)
              break
          }

          mu <- linkinv(eta <- eta + offset)
          dev <- sum(dev.resids(y, mu, weights))
          if (control$trace) message(gettextf("Deviance = %s Iterations - %d", dev, iter, domain = "R-mgcv"))

          boundary <- FALSE

          if (!is.finite(dev)) {
            if (is.null(coefold)) {
              if (is.null(null.coef))
                stop("no valid set of coefficients has been found: please supply starting values",
                     call. = FALSE)
              ## Try to find feasible coefficients from the null.coef and null.eta
              coefold <- null.coef
              etaold <- null.eta
            }
            warn[[length(warn)+1]] <- "gam.fit3 step size truncated due to divergence"
            ii <- 1
            while (!is.finite(dev)) {
                if (ii > control$maxit) stop("inner loop 1; can't correct step size")
                ii <- ii + 1
                start <- (start + coefold)/2
                eta <- (eta + etaold)/2
                mu <- linkinv(eta)
                dev <- sum(dev.resids(y, mu, weights))
            }
            boundary <- TRUE
            penalty <- t(start)%*%St%*%start
            if (control$trace) cat("Step halved: new deviance =", dev, "\n")
          }

          if (!(valideta(eta) && validmu(mu))) {
            warn[[length(warn)+1]] <- "gam.fit3 step size truncated: out of bounds"
            ii <- 1
            while (!(valideta(eta) && validmu(mu))) {
                if (ii > control$maxit) stop("inner loop 2; can't correct step size")
                ii <- ii + 1
                start <- (start + coefold)/2
                eta <- (eta + etaold)/2
                mu <- linkinv(eta)
            }
            boundary <- TRUE
            penalty <- t(start)%*%St%*%start
            dev <- sum(dev.resids(y, mu, weights))
            if (control$trace) cat("Step halved: new deviance =", dev, "\n")
          }

          pdev <- dev + penalty ## the penalized deviance

          if (control$trace) message(gettextf("penalized deviance = %s", pdev, domain = "R-mgcv"))

          div.thresh <- 10*(.1+abs(old.pdev))*.Machine$double.eps^.5
          ## ... threshold for judging divergence --- too tight and near
          ## perfect convergence can cause a failure here

          if (pdev-old.pdev>div.thresh) { ## solution diverging
             ii <- 1 ## step halving counter
             if (iter==1) { ## immediate divergence, need to shrink towards zero
               etaold <- null.eta; coefold <- null.coef
             }
             while (pdev - old.pdev > div.thresh) { ## step halve until pdev <= old.pdev
               if (ii > 100) stop("inner loop 3; can't correct step size")
               ii <- ii + 1
               start <- (start + coefold)/2
               eta <- (eta + etaold)/2
               mu <- linkinv(eta)
               dev <- sum(dev.resids(y, mu, weights))
               pdev <- dev + t(start)%*%St%*%start ## the penalized deviance
               if (control$trace) message(gettextf("Step halved: new penalized deviance = %g", pdev, "\n"))
             }
          }

          if (strictly.additive) { conv <- TRUE;coef <- start;break;}

          if (abs(pdev - old.pdev) < control$epsilon*(abs(scale)+abs(pdev))) {
             ## Need to check coefs converged adequately, to ensure implicit differentiation
             ## ok. Testing coefs unchanged is problematic under rank deficiency (not guaranteed to
             ## drop same parameter every iteration!)
             grad <- 2 * t(x[good,])%*%(w*((x%*%start)[good]-z))+ 2*St%*%start
             #if (max(abs(grad)) > control$epsilon*max(abs(start+coefold))/2) {
             if (max(abs(grad)) > control$epsilon*(abs(pdev)+abs(scale))) {
               old.pdev <- pdev
               coef <- coefold <- start
               etaold <- eta
             } else {
               conv <- TRUE
               coef <- start
               etaold <- eta
               break
             }
          } else {
             old.pdev <- pdev
             coef <- coefold <- start
             etaold <- eta
          }
      } ### end main loop

      wdr <- dev.resids(y, mu, weights)
      dev <- sum(wdr)
      #wdr <- sign(y-mu)*sqrt(pmax(wdr,0)) ## used below in scale estimation

      ## Now call the derivative calculation scheme. This requires the
      ## following inputs:
      ## z and w - the pseudodata and weights
      ## X the model matrix and E where EE'=S
      ## rS the single penalty square roots
      ## sp the log smoothing parameters
      ## y and mu the data and model expected values
      ## g1,g2,g3 - the first 3 derivatives of g(mu) wrt mu
      ## V,V1,V2 - V(mu) and its first two derivatives wrt mu
      ## on output it returns the gradient and hessian for
      ## the deviance and trA

      good <- weights > 0
      var.val <- variance(mu)
      varmu <- var.val[good]
      if (any(is.na(varmu))) stop("NAs in V(mu)")
      if (any(varmu == 0)) stop("0s in V(mu)")
      mu.eta.val <- mu.eta(eta)
      if (any(is.na(mu.eta.val[good]))) stop("NAs in d(mu)/d(eta)")

      good <- (weights > 0) & (mu.eta.val != 0)
      mevg <- mu.eta.val[good];mug <- mu[good];yg <- y[good]
      weg <- weights[good];etag <- eta[good]
      var.mug<-var.val[good]

      if (fisher) { ## Conventional Fisher scoring
        z <- (eta - offset)[good] + (yg - mug)/mevg
        w <- (weg * mevg^2)/var.mug
        alpha <- wf <- 0 ## Don't need Fisher weights separately
      } else { ## full Newton
        c <- yg - mug
        alpha <- 1 + c*(family$dvar(mug)/var.mug + family$d2link(mug)*mevg)
        ### can't just drop obs when alpha==0, as they are informative, but
        ### happily using an `effective zero' is stable here, and there is
        ### a natural effective zero, since E(alpha) = 1.
        alpha[alpha==0] <- .Machine$double.eps
        z <- (eta - offset)[good] + (yg-mug)/(mevg*alpha)
        ## ... offset subtracted as eta = X%*%beta + offset
        wf <- weg*mevg^2/var.mug ## Fisher weights for EDF calculation
        w <- wf * alpha ## Full Newton weights
      }

      g1 <- 1/mevg
      g2 <- family$d2link(mug)
      g3 <- family$d3link(mug)

      V <- family$variance(mug)
      V1 <- family$dvar(mug)
      V2 <- family$d2var(mug)

      if (fisher) {
        g4 <- V3 <- 0
      } else {
        g4 <- family$d4link(mug)
        V3 <- family$d3var(mug)
      }

      if (TRUE) { ### TEST CODE for derivative ratio based versions of code...
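        ## The block below switches to derivative *ratios*: g2,g3,g4 are divided
        ## through by g1, and V1,V2,V3 by V, so later expressions work with
        ## quantities like g''/g' and V'/V. A hedged illustration of why this
        ## helps (not run; the mu values are made up), using the log link where
        ## g'(mu)=1/mu and g''(mu)=-1/mu^2:
        #  mu <- c(1e-7,1,1e7)
        #  g1 <- 1/mu; g2 <- -1/mu^2
        #  g2/g1  ## = -1/mu : spans far fewer orders of magnitude than g2 itself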
        g2 <- g2/g1;g3 <- g3/g1;g4 <- g4/g1
        V1 <- V1/V;V2 <- V2/V;V3 <- V3/V
      }

      P1 <- D1 <- array(0,nSp);P2 <- D2 <- matrix(0,nSp,nSp) # for derivs of deviance/ Pearson
      trA1 <- array(0,nSp);trA2 <- matrix(0,nSp,nSp) # for derivs of tr(A)
      rV=matrix(0,ncol(x),ncol(x));
      dum <- 1
      if (control$trace) cat("calling gdi...")

      REML <- 0 ## signals GCV/AIC used
      if (scoreType%in%c("REML","P-REML","NCV")) {REML <- 1;remlInd <- 1} else
      if (scoreType%in%c("ML","P-ML")) {REML <- -1;remlInd <- 0}

      if (REML==0) rSncol <- unlist(lapply(rS,ncol)) else rSncol <- unlist(lapply(UrS,ncol))
      if (control$trace) t1 <- proc.time()
      oo <- .C(C_gdi1,X=as.double(x[good,]),E=as.double(Sr),Eb = as.double(Eb),
               rS = as.double(unlist(rS)),U1=as.double(U1),sp=as.double(exp(sp)),
               z=as.double(z),w=as.double(w),wf=as.double(wf),alpha=as.double(alpha),
               mu=as.double(mug),eta=as.double(etag),y=as.double(yg),
               p.weights=as.double(weg),g1=as.double(g1),g2=as.double(g2),
               g3=as.double(g3),g4=as.double(g4),V0=as.double(V),V1=as.double(V1),
               V2=as.double(V2),V3=as.double(V3),beta=as.double(coef),b1=as.double(rep(0,nSp*ncol(x))),
               w1=as.double(rep(0,nSp*length(z))),
               D1=as.double(D1),D2=as.double(D2),P=as.double(dum),P1=as.double(P1),P2=as.double(P2),
               trA=as.double(dum),trA1=as.double(trA1),trA2=as.double(trA2),
               rV=as.double(rV),rank.tol=as.double(rank.tol),
               conv.tol=as.double(control$epsilon),rank.est=as.integer(1),n=as.integer(length(z)),
               p=as.integer(ncol(x)),M=as.integer(nSp),Mp=as.integer(Mp),Enrow = as.integer(rows.E),
               rSncol=as.integer(rSncol),deriv=as.integer(deriv.sp),
               REML = as.integer(REML),fisher=as.integer(fisher),
               fixed.penalty = as.integer(rp$fixed.penalty),nthreads=as.integer(control$nthreads),
               dVkk=as.double(rep(0,nSp*nSp)))
      if (control$trace) {
        tg <- sum((proc.time()-t1)[c(1,4)])
        cat("done!\n")
      }

      ## get dbeta/drho, original parameterization restored on return
      db.drho <- if (deriv) matrix(oo$b1,ncol(x),nSp) else NULL
      dw.drho <- if (deriv) matrix(oo$w1,length(z),nSp) else NULL
      ## NOTE: only deriv of Newton weights if REML=1 in gdi1 call

      rV <- matrix(oo$rV,ncol(x),ncol(x)) ## rV%*%t(rV)*scale gives covariance matrix

      Kmat <- matrix(0,nrow(x),ncol(x))
      Kmat[good,] <- oo$X ## rV%*%t(K)%*%(sqrt(wf)*X) = F; diag(F) is edf array

      coef <- oo$beta;
      eta <- drop(x%*%coef + offset)
      mu <- linkinv(eta)
      if (!(validmu(mu)&&valideta(eta))) {
        ## if iteration terminated with step halving then it can be that
        ## gdi1 returns an invalid coef, because likelihood is actually
        ## pushing coefs to invalid region. Probably not much hope in
        ## this case, but it is worth at least returning feasible values,
        ## even though this is not quite consistent with derivs.
        coef <- start
        eta <- etaold
        mu <- linkinv(eta)
      }
      trA <- oo$trA;

      if (control$scale.est%in%c("pearson","fletcher","Pearson","Fletcher")) {
        pearson <- sum(weights*(y-mu)^2/family$variance(mu))
        scale.est <- (pearson+dev.extra)/(n.true-trA)
        if (control$scale.est%in%c("fletcher","Fletcher")) { ## Apply Fletcher (2012) correction
          ## note limited to 10 times Pearson...
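          ## The correction divides the Pearson estimate by (1+s.bar), where
          ## s.bar is the average of V'(mu_i)(y_i-mu_i)/V(mu_i). A minimal
          ## stand-alone sketch (not run; 'fit' is a hypothetical fitted
          ## Poisson model, for which V(mu)=mu and V'(mu)=1):
          #  mu <- fitted(fit); yy <- fit$y; edf <- sum(fit$edf)
          #  phi.pearson <- sum((yy-mu)^2/mu)/(length(yy)-edf)
          #  s.bar <- mean((yy-mu)/mu)
          #  phi.fletcher <- phi.pearson/(1+s.bar)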
          #s.bar = max(-.9,mean(family$dvar(mu)*(y-mu)*sqrt(weights)/family$variance(mu)))
          s.bar = max(-.9,mean(family$dvar(mu)*(y-mu)/family$variance(mu)))
          if (is.finite(s.bar)) scale.est <- scale.est/(1+s.bar)
        }
      } else { ## use the deviance estimator
        scale.est <- (dev+dev.extra)/(n.true-trA)
      }

      reml.scale <- NA
      Vg=NULL ## empirical cov matrix for grad (see NCV)

      if (scoreType%in%c("REML","ML")) { ## use Laplace (RE)ML

        ls <- family$ls(y,weights,n,scale)*n.true/nobs ## saturated likelihood and derivatives
        Dp <- dev + oo$conv.tol + dev.extra
        REML <- (Dp/(2*scale) - ls[1])/gamma + oo$rank.tol/2 - rp$det/2 -
                remlInd*(Mp/2*(log(2*pi*scale)-log(gamma)))
        attr(REML,"Dp") <- Dp/(2*scale)
        if (deriv) {
          REML1 <- oo$D1/(2*scale*gamma) + oo$trA1/2 - rp$det1/2
          if (deriv==2) REML2 <- (matrix(oo$D2,nSp,nSp)/(scale*gamma) + matrix(oo$trA2,nSp,nSp) - rp$det2)/2
          if (sum(!is.finite(REML2))) {
            stop("Non-finite derivatives. Try decreasing fit tolerance! See `epsilon' in `gam.control'")
          }
        }

        if (!scale.known&&deriv) { ## need derivatives wrt log scale, too
          ##ls <- family$ls(y,weights,n,scale) ## saturated likelihood and derivatives
          dlr.dlphi <- (-Dp/(2 *scale) - ls[2]*scale)/gamma - Mp/2*remlInd
          d2lr.d2lphi <- (Dp/(2*scale) - ls[3]*scale^2 - ls[2]*scale)/gamma
          d2lr.dspphi <- -oo$D1/(2*scale*gamma)
          REML1 <- c(REML1,dlr.dlphi)
          if (deriv==2) {
            REML2 <- rbind(REML2,as.numeric(d2lr.dspphi))
            REML2 <- cbind(REML2,c(as.numeric(d2lr.dspphi),d2lr.d2lphi))
          }
        }
        reml.scale <- scale
      } else if (scoreType%in%c("P-REML","P-ML")) { ## scale unknown use Pearson-Laplace REML

        reml.scale <- phi <- (oo$P*(nobs-Mp)+pearson.extra)/(n.true-Mp) ## REMLish scale estimate
        ## correct derivatives, if needed...
        oo$P1 <- oo$P1*(nobs-Mp)/(n.true-Mp)
        oo$P2 <- oo$P2*(nobs-Mp)/(n.true-Mp)

        ls <- family$ls(y,weights,n,phi)*n.true/nobs ## saturated likelihood and derivatives

        Dp <- dev + oo$conv.tol + dev.extra

        K <- oo$rank.tol/2 - rp$det/2

        REML <- (Dp/(2*phi) - ls[1]) + K - Mp/2*(log(2*pi*phi))*remlInd
        attr(REML,"Dp") <- Dp/(2*phi)
        if (deriv) {
          phi1 <- oo$P1; Dp1 <- oo$D1; K1 <- oo$trA1/2 - rp$det1/2;
          REML1 <- Dp1/(2*phi) - phi1*(Dp/(2*phi^2)+Mp/(2*phi)*remlInd + ls[2]) + K1
          if (deriv==2) {
            phi2 <- matrix(oo$P2,nSp,nSp);Dp2 <- matrix(oo$D2,nSp,nSp)
            K2 <- matrix(oo$trA2,nSp,nSp)/2 - rp$det2/2
            REML2 <- Dp2/(2*phi) - (outer(Dp1,phi1)+outer(phi1,Dp1))/(2*phi^2) +
                     (Dp/phi^3 - ls[3] + Mp/(2*phi^2)*remlInd)*outer(phi1,phi1) -
                     (Dp/(2*phi^2)+ls[2]+Mp/(2*phi)*remlInd)*phi2 + K2
          }
        }

      } else if (scoreType %in% "NCV") { ## neighbourhood cross validation
        ## requires that neighbours are supplied in nei; if NULL each point is its own
        ## neighbour, recovering LOOCV. nei$k[(nei$m[i-1]+1):nei$m[i]] are the indices of the
        ## neighbours of point i, where nei$m[0]=0 by convention.
        ww <- w1 <- rep(0,nobs)
        ww[good] <- weg*(yg - mug)*mevg/var.mug
        w1[good] <- w
        eta.cv <- rep(0.0,length(nei$i))
        deta.cv <- if (deriv) matrix(0.0,length(nei$i),length(rS)) else matrix(0.0,1,length(rS))
        if (nei$jackknife>2) { ## need coef changes for each NCV drop fold.
          dd <- matrix(0,ncol(x),length(nei$m))
          if (deriv>0) stop("jackknife and derivatives requested together")
          deriv1 <- -1 ## signal that coef changes to be returned
        } else { ## dd unused
          dd <- matrix(1.0,1,1); deriv1 <- deriv
        }
        R <- try(chol(crossprod(x,w1*x)+St),silent=TRUE)
        if (inherits(R,"try-error")) { ## use CG approach...
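          ## Cholesky factorization of the penalized Hessian failed (it is not
          ## numerically +ve definite), so fall back on the conjugate gradient
          ## based NCV computation below, which works from the inverse of the
          ## penalized expected Hessian instead...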
          Hi <- tcrossprod(rV) ## inverse of penalized Expected Hessian - inverse actual Hessian probably better
          cg.iter <- .Call(C_ncv,x,Hi,ww,w1,db.drho,dw.drho,rS,nei$i-1,nei$mi,nei$m,nei$k-1,coef,exp(sp),eta.cv,
                           deta.cv, dd, deriv1);
          warn[[length(warn)+1]] <- "NCV positive definite update check not possible"
        } else { ## use Cholesky update approach
          pdef.fails <- .Call(C_Rncv,x,R,ww,w1,db.drho,dw.drho,rS,nei$i-1,nei$mi,nei$m,nei$k-1,coef,exp(sp),eta.cv,
                              deta.cv, dd, deriv1,.Machine$double.eps,control$ncv.threads);
          if (pdef.fails) warn[[length(warn)+1]] <- "some NCV updates not positive definite"
        }

        if (family$qapprox) {
          NCV <- sum(wdr[nei$i]) + gamma*sum(-2*ww[nei$i]*(eta.cv-eta[nei$i]) + w1[nei$i]*(eta.cv-eta[nei$i])^2)
          if (deriv) {
            deta <- x%*%db.drho
            alpha1 <- if (fisher) 0 else (-(V1+g2) + (y-mu)*(V2-V1^2+g3-g2^2))/alpha
            w3 <- w1/g1*(alpha1 - V1 - 2 * g2)
            ncv1 <- -2*ww[nei$i]*((1-gamma)*deta[nei$i,] + gamma*deta.cv) +
                    2*gamma*w1[nei$i]*(deta.cv*(eta.cv-eta[nei$i])) +
                    gamma*w3[nei$i]* deta[nei$i,]*(eta.cv-eta[nei$i])^2
          }
        } else { ## exact version
          if (TRUE) { ## version that doesn't just drop neighbourhood, but tries equivalent (gamma==2) perturbation beyond dropping
            eta.cv <- gamma*(eta.cv) - (gamma-1)*eta[nei$i]
            if (deriv && gamma!=1) deta.cv <- gamma*(deta.cv) - (gamma-1)*(x%*%db.drho)[nei$i,,drop=FALSE]
            gamma <- 1
          }
          mu.cv <- linkinv(eta.cv)
          NCV <- gamma*sum(dev.resids(y[nei$i],mu.cv,weights[nei$i])) - (gamma-1)*sum(wdr[nei$i])
          ## the NCV score - simply LOOCV if nei(i) = i for all i
          if (deriv) {
            dev1 <- if (gamma==1) 0 else -2*(ww*(x%*%db.drho))[nei$i,,drop=FALSE]
            var.mug <- variance(mu.cv)
            mevg <- mu.eta(eta.cv)
            mug <- mu.cv
            ww1 <- weights[nei$i]*(y[nei$i]-mug)*mevg/var.mug
            ww1[!is.finite(ww1)] <- 0
            ncv1 <- -2*ww1*deta.cv*gamma - (gamma-1)*dev1
          } ## if deriv
        }
        if (deriv) {
          NCV1 <- colSums(ncv1) ## grad
          #attr(NCV1,"gjk") <- gjk ## jackknife grad estimate - incomplete - need qapprox version as well
          Vg <- crossprod(ncv1) ## empirical cov matrix of grad
        }
        if (nei$jackknife>2) {
          nk <- c(nei$m[1],diff(nei$m)) ## dropped fold sizes
          jkw <- ((nobs-nk)/(nobs))^.4/sqrt(nk) ## jackknife weights
          dd <- jkw*t(dd)%*%t(T)
          #Vj <- crossprod(dd) ## jackknife cov matrix
          #attr(Vj,"dd") <- dd
          #attr(NCV,"Vj") <- Vj
          attr(NCV,"dd") <- dd
        }
        attr(NCV,"eta.cv") <- eta.cv
        if (deriv) attr(NCV,"deta.cv") <- deta.cv
      } else { ## GCV/GACV etc ....

        P <- oo$P

        delta <- nobs - gamma * trA
        delta.2 <- delta*delta

        GCV <- nobs*dev/delta.2
        GACV <- dev/nobs + P * 2*gamma*trA/(delta * nobs)

        UBRE <- dev/nobs - 2*delta*scale/nobs + scale

        if (deriv) {
          trA1 <- oo$trA1
          D1 <- oo$D1
          P1 <- oo$P1

          if (sum(!is.finite(D1))||sum(!is.finite(P1))||sum(!is.finite(trA1))) {
            stop("Non-finite derivatives. Try decreasing fit tolerance! See `epsilon' in `gam.control'")
          }

          delta.3 <- delta*delta.2

          GCV1 <- nobs*D1/delta.2 + 2*nobs*dev*trA1*gamma/delta.3
          GACV1 <- D1/nobs + 2*P/delta.2 * trA1 + 2*gamma*trA*P1/(delta*nobs)

          UBRE1 <- D1/nobs + gamma * trA1 *2*scale/nobs
          if (deriv==2) {
            trA2 <- matrix(oo$trA2,nSp,nSp)
            D2 <- matrix(oo$D2,nSp,nSp)
            P2 <- matrix(oo$P2,nSp,nSp)

            if (sum(!is.finite(D2))||sum(!is.finite(P2))||sum(!is.finite(trA2))) {
              stop("Non-finite derivatives. Try decreasing fit tolerance! See `epsilon' in `gam.control'")
            }

            GCV2 <- outer(trA1,D1)
            GCV2 <- (GCV2 + t(GCV2))*gamma*2*nobs/delta.3 +
                    6*nobs*dev*outer(trA1,trA1)*gamma*gamma/(delta.2*delta.2) +
                    nobs*D2/delta.2 + 2*nobs*dev*gamma*trA2/delta.3
            GACV2 <- D2/nobs + outer(trA1,trA1)*4*P/(delta.3) +
                     2 * P * trA2 / delta.2 + 2 * outer(trA1,P1)/delta.2 +
                     2 * outer(P1,trA1) *(1/(delta * nobs) + trA/(nobs*delta.2)) +
                     2 * trA * P2 /(delta * nobs)
            GACV2 <- (GACV2 + t(GACV2))*.5
            UBRE2 <- D2/nobs +2*gamma * trA2 * scale / nobs
          } ## end if (deriv==2)
        } ## end if (deriv)
      } ## end !REML
      # end of inserted code
      if (!conv)
        warn[[length(warn)+1]] <- "gam.fit3 algorithm did not converge"
      if (boundary)
        warn[[length(warn)+1]] <- "gam.fit3 algorithm stopped at boundary value"
      eps <- 10 * .Machine$double.eps
      if (printWarn&&family$family[1] == "binomial") {
        if (any(mu > 1 - eps) || any(mu < eps))
          warn[[length(warn)+1]] <- "gam.fit3 fitted probabilities numerically 0 or 1 occurred"
      }
      if (printWarn&&family$family[1] == "poisson") {
        if (any(mu < eps))
          warn[[length(warn)+1]] <- "gam.fit3 fitted rates numerically 0 occurred"
      }
      residuals <- rep.int(NA, nobs)
      residuals[good] <- z - (eta - offset)[good]

      ## undo reparameterization....
      coef <- as.numeric(T %*% coef)
      rV <- T %*% rV
      names(coef) <- xnames
  } ### end if (!EMPTY)
  names(residuals) <- ynames
  names(mu) <- ynames
  names(eta) <- ynames
  ww <- wt <- rep.int(0, nobs)
  if (fisher) { wt[good] <- w; ww <- wt} else {
    wt[good] <- wf ## note that Fisher weights are returned
    ww[good] <- w
  }
  names(wt) <- ynames
  names(weights) <- ynames
  names(y) <- ynames

  if (deriv) {
    db.drho <- T%*%db.drho
    if (nrow(dw.drho)!=nrow(x)) {
      w1 <- dw.drho
      dw.drho <- matrix(0,nrow(x),ncol(w1))
      dw.drho[good,] <- w1
    }
  }

  sumw <- sum(weights)
  wtdmu <- if (intercept) sum(weights * y)/sumw else linkinv(offset)
  nulldev <- sum(dev.resids(y, wtdmu, weights))
  n.ok <- nobs - sum(weights == 0)
  nulldf <- n.ok - as.integer(intercept)

  ## use exact MLE scale param for aic in gaussian case, otherwise scale.est (unless known)
  dev1 <- if (scale.known) scale*sumw else if (family$family=="gaussian") dev else
          if (is.na(reml.scale)) scale.est*sumw else reml.scale*sumw
  aic.model <- aic(y, n, mu, weights, dev1) # note: incomplete 2*edf needs to be added

  if (control$trace) {
    t1 <- proc.time()
    at <- sum((t1-t0)[c(1,4)])
    cat("Proportion time in C: ",(tc+tg)/at," ls:",tc/at," gdi:",tg/at,"\n")
  }

  list(coefficients = coef, residuals = residuals, fitted.values = mu,
       family = family, linear.predictors = eta, deviance = dev,
       null.deviance = nulldev, iter = iter, weights = wt, working.weights=ww,prior.weights = weights,
       z=z, df.null = nulldf, y = y, converged = conv,##pearson.warning = pearson.warning,
       boundary = boundary,D1=D1,D2=D2,P=P,P1=P1,P2=P2,trA=trA,trA1=trA1,trA2=trA2,NCV=NCV,NCV1=NCV1,
       GCV=GCV,GCV1=GCV1,GCV2=GCV2,GACV=GACV,GACV1=GACV1,GACV2=GACV2,UBRE=UBRE,
       UBRE1=UBRE1,UBRE2=UBRE2,REML=REML,REML1=REML1,REML2=REML2,rV=rV,Vg=Vg,db.drho=db.drho,
       dw.drho=dw.drho,dVkk = matrix(oo$dVkk,nSp,nSp),ldetS1 = if (grderiv) rp$det1 else 0,
       scale.est=scale.est,reml.scale= reml.scale,aic=aic.model,rank=oo$rank.est,K=Kmat,warn=warn)
} ## end gam.fit3


Vb.corr <- function(X,L,lsp0,S,off,dw,w,rho,Vr,nth=0,scale.est=FALSE) {
## compute higher order Vb correction...
## If w is NULL then X should be root Hessian, and
## dw is treated as if it was 0, otherwise X should be model
## matrix.
## dw is derivative w.r.t. all the smoothing parameters and family parameters as if these
## were not linked, but not the scale parameter, of course.
## Vr includes scale uncertainty, if scale estimated...
## nth is the number of initial elements of rho that are not smoothing
## parameters, scale.est is TRUE if scale estimated by REML and must be
## dropped from s.p.s
  M <- length(off) ## number of penalty terms
  if (scale.est) {
    ## drop scale param from L, rho and Vr...
    rho <- rho[-length(rho)]
    if (!is.null(L)) L <- L[-nrow(L),-ncol(L),drop=FALSE]
    Vr <- Vr[-nrow(Vr),-ncol(Vr),drop=FALSE]
  }
  if (is.null(lsp0)) lsp0 <- if (is.null(L)) rho*0 else rep(0,nrow(L))
  ## note that last element of lsp0 can be a scale parameter...
  lambda <- if (is.null(L)) exp(rho+lsp0[1:length(rho)]) else exp(L%*%rho + lsp0[1:nrow(L)])

  ## Re-create the Hessian, if is.null(w) then X assumed to be root
  ## unpenalized Hessian...
  H <- if (is.null(w)) crossprod(X) else t(X)%*%(w*X)
  if (M>0) for (i in 1:M) {
    ind <- off[i] + 1:ncol(S[[i]]) - 1
    H[ind,ind] <- H[ind,ind] + lambda[i+nth] * S[[i]]
  }

  R <- try(chol(H),silent=TRUE) ## get its Choleski factor.
  if (inherits(R,"try-error")) return(0) ## bail out as Hessian insufficiently well conditioned

  ## Create dH the derivatives of the hessian w.r.t. (all) the smoothing parameters...
  dH <- list()
  if (length(lambda)>0) for (i in 1:length(lambda)) {
    ## If w==NULL use constant H approx...
    dH[[i]] <- if (is.null(w)) H*0 else t(X)%*%(dw[,i]*X)
    if (i>nth) {
      ind <- off[i-nth] + 1:ncol(S[[i-nth]]) - 1
      dH[[i]][ind,ind] <- dH[[i]][ind,ind] + lambda[i]*S[[i-nth]]
    }
  }
  ## If L supplied then dH has to be re-weighted to give
  ## derivatives w.r.t. optimization smoothing params.
  if (!is.null(L)) {
    dH1 <- dH;dH <- list()
    if (length(rho)>0) for (j in 1:length(rho)) {
      ok <- FALSE ## dH[[j]] not yet created
      if (nrow(L)>0) for (i in 1:nrow(L)) if (L[i,j]!=0.0) {
        dH[[j]] <- if (ok) dH[[j]] + dH1[[i]]*L[i,j] else dH1[[i]]*L[i,j]
        ok <- TRUE
      }
    }
    rm(dH1)
  } ## dH now w.r.t. optimization parameters

  if (length(dH)==0) return(0) ## nothing to correct

  ## Get derivatives of Choleski factor w.r.t. the smoothing parameters
  dR <- list()
  for (i in 1:length(dH)) dR[[i]] <- dchol(dH[[i]],R)
  rm(dH)

  ## need to transform all dR to dR^{-1} = -R^{-1} dR R^{-1}...
  for (i in 1:length(dR)) dR[[i]] <- -t(forwardsolve(t(R),t(backsolve(R,dR[[i]]))))

  ## BUT: dR, now upper triangular, and it relates to RR' = Vb not R'R = Vb
  ## in consequence of which Rz is the thing with the right distribution
  ## and not R'z...

  dbg <- FALSE
  if (dbg) { ## debugging code
    n.rep <- 10000;p <- ncol(R)
    r <- rmvn(n.rep,rep(0,M),Vr)
    b <- matrix(0,n.rep,p)
    for (i in 1:n.rep) {
      z <- rnorm(p)
      if (M>0) for (j in 1:M) b[i,] <- b[i,] + dR[[j]]%*%z*(r[i,j])
    }
    Vfd <- crossprod(b)/n.rep
  }

  vcorr(dR,Vr,FALSE) ## NOTE: unscaled!!
} ## Vb.corr

gam.fit3.post.proc <- function(X,L,lsp0,S,off,object,gamma) {
## get edf array and covariance matrices after a gam fit.
## X is original model matrix, L the mapping from working to full sp
  scale <- if (object$scale.estimated) object$scale.est else object$scale
  Vb <- object$rV%*%t(object$rV)*scale ## Bayesian cov.
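  ## Below F = (X'WX+S)^{-1} X'WX is formed (W the returned working weights),
  ## whose leading diagonal gives the EDF per coefficient. An equivalent but
  ## slower and less stable computation, given only as an illustrative
  ## cross-check of the identity used:
  #  edf.direct <- diag(Vb %*% crossprod(sqrt(object$weights)*X))/scale
  #  range(edf.direct - diag(F)) ## should be numerically negligible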
  # PKt <- object$rV%*%t(object$K)
  PKt <- .Call(C_mgcv_pmmult2,object$rV,object$K,0,1,object$control$nthreads)
  # F <- PKt%*%(sqrt(object$weights)*X)
  F <- .Call(C_mgcv_pmmult2,PKt,sqrt(object$weights)*X,0,0,object$control$nthreads)
  edf <- diag(F)                  ## effective degrees of freedom
  edf1 <- 2*edf - rowSums(t(F)*F) ## alternative
  ## edf <- rowSums(PKt*t(sqrt(object$weights)*X))
  ## Ve <- PKt%*%t(PKt)*object$scale ## frequentist cov
  Ve <- F%*%Vb ## not quite as stable as above, but quicker
  hat <- rowSums(object$K*object$K)
  ## get QR factor R of WX - more efficient to do this
  ## in gdi_1 really, but that means making QR of augmented
  ## a two stage thing, so not clear cut...
  WX <- sqrt(object$weights)*X
  qrx <- pqr(WX,object$control$nthreads)
  R <- pqr.R(qrx);R[,qrx$pivot] <- R
  if (!is.na(object$reml.scale)&&!is.null(object$db.drho)) { ## compute sp uncertainty correction
    hess <- object$outer.info$hess
    edge.correct <- if (is.null(attr(hess,"edge.correct"))) FALSE else TRUE
    K <- if (edge.correct) 2 else 1
    for (k in 1:K) {
      if (k==1) { ## fitted model computations
        db.drho <- object$db.drho
        dw.drho <- object$dw.drho
        lsp <- log(object$sp)
      } else { ## edge corrected model computations
        db.drho <- attr(hess,"db.drho1")
        dw.drho <- attr(hess,"dw.drho1")
        lsp <- attr(hess,"lsp1")
        hess <- attr(hess,"hess1")
      }
      M <- ncol(db.drho)
      ## transform to derivs w.r.t. working, noting that an extra final row of L
      ## may be present, relating to scale parameter (for which db.drho is 0 since it's a scale parameter)
      if (!is.null(L)) {
        db.drho <- db.drho%*%L[1:M,,drop=FALSE]
        M <- ncol(db.drho)
      }
      ## extract cov matrix for log smoothing parameters...
      ev <- eigen(hess,symmetric=TRUE)
      d <- ev$values;ind <- d <= 0
      d[ind] <- 0;d[!ind] <- 1/sqrt(d[!ind])
      rV <- (d*t(ev$vectors))[,1:M] ## root of cov matrix
      Vc <- crossprod(rV%*%t(db.drho))
      ## set a prior precision on the smoothing parameters, but don't use it to
      ## fit, only to regularize Cov matrix. exp(4*var^.5) gives approx
      ## multiplicative range. e.g. var = 5.3 says parameter between .01 and 100 times
      ## estimate. Avoids nonsense at `infinite' smoothing parameters.
      d <- ev$values; d[ind] <- 0;
      d <- if (k==1) 1/sqrt(d+1/10) else 1/sqrt(d+1e-7)
      Vr <- crossprod(d*t(ev$vectors))
      ## Note that db.drho and dw.drho are derivatives w.r.t. full set of smoothing
      ## parameters excluding any scale parameter, but Vr includes info for scale parameter
      ## if it has been estimated.
      nth <- if (is.null(object$family$n.theta)) 0 else object$family$n.theta ## any parameters of family itself
      drop.scale <- object$scale.estimated && !(object$method %in% c("P-REML","P-ML"))
      Vc2 <- scale*Vb.corr(R,L,lsp0,S,off,dw.drho,w=NULL,lsp,Vr,nth,drop.scale)

      Vc <- Vb + Vc + Vc2 ## Bayesian cov matrix with sp uncertainty
      ## finite sample size check on edf sanity...
      if (k==1) { ## compute edf2 only with fitted model, not edge corrected
        edf2 <- rowSums(Vc*crossprod(R))/scale
        if (sum(edf2)>sum(edf1)) {
          edf2 <- edf1
        }
      }
    } ## k loop
    V.sp <- Vr;attr(V.sp,"L") <- L;attr(V.sp,"spind") <- spind <- (nth+1):M

    ## EXPERIMENTAL
    ## NOTE: no L handling - what about incomplete z/w??
    #P <- Vb %*% X/scale
    if (FALSE) {
      sp <- object$sp[spind]
      beta <- object$coefficients
      w <- object$weights
      m <- length(off)
      M <- K <- matrix(0,length(w),m)
      for (i in 1:m) {
        ii <- 1:ncol(S[[i]])+off[i]-1
        Sb <- S[[i]] %*% beta[ii]*sp[i]/object$scale ## don't forget Vb is inverse hessian times scale!
        K[,i] <- w * drop(X %*% (Vb[,ii] %*% Sb))/object$scale
        B <- Vb%*%drop(t(w*object$z) %*% X)
        M[,i] <- X%*% (Vb[,ii] %*% (S[[i]]%*%B[ii]))*sp[i]/object$scale^2
      }
      ## Should following really just drop scale term?
      K <- K %*% Vr[spind,spind]
      edf3 <- sum(K*M)
      attr(edf2,"edf3") <- edf3
    } ## END EXPERIMENTAL
  } else V.sp <- edf2 <- Vc <- NULL

  ret <- list(Vp=Vb,Ve=Ve,V.sp=V.sp,edf=edf,edf1=edf1,edf2=edf2,hat=hat,F=F,R=R)
  if (is.null(object$Vc)) ret$Vc <- Vc
  ret
} ## gam.fit3.post.proc


score.transect <- function(ii, x, y, sp, Eb,UrS=list(),
            weights = rep(1, length(y)), start = NULL,
            etastart = NULL, mustart = NULL, offset = rep(0, length(y)),U1,Mp,family = gaussian(),
            control = gam.control(), intercept = TRUE,deriv=2,
            gamma=1,scale=1,printWarn=TRUE,scoreType="REML",eps=1e-7,null.coef=rep(0,ncol(x)),...) {
## plot a transect through the score for sp[ii]
  np <- 200
  if (scoreType%in%c("REML","P-REML","ML","P-ML")) reml <- TRUE else reml <- FALSE
  score <- spi <- seq(-30,30,length=np)
  for (i in 1:np) {

    sp[ii] <- spi[i]
    b<-gam.fit3(x=x, y=y, sp=sp,Eb=Eb,UrS=UrS,
        offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=0,
        control=control,gamma=gamma,scale=scale,
        printWarn=FALSE,mustart=mustart,scoreType=scoreType,null.coef=null.coef,...)

    if (reml) {
      score[i] <- b$REML
    } else if (scoreType=="GACV") {
      score[i] <- b$GACV
    } else if (scoreType=="UBRE"){
      score[i] <- b$UBRE
    } else { ## default to deviance based GCV
      score[i] <- b$GCV
    }
  }
  par(mfrow=c(2,2),mar=c(4,4,1,1))
  plot(spi,score,xlab="log(sp)",ylab=scoreType,type="l")
  plot(spi[1:(np-1)],score[2:np]-score[1:(np-1)],type="l",ylab="differences")
  plot(spi,score,ylim=c(score[1]-.1,score[1]+.1),type="l")
  plot(spi,score,ylim=c(score[np]-.1,score[np]+.1),type="l")
} ## score.transect


deriv.check <- function(x, y, sp, Eb,UrS=list(),
            weights = rep(1, length(y)), start = NULL,
            etastart = NULL, mustart = NULL, offset = rep(0, length(y)),U1,Mp,family = gaussian(),
            control = gam.control(), intercept = TRUE,deriv=2,
            gamma=1,scale=1,printWarn=TRUE,scoreType="REML",eps=1e-7,
            null.coef=rep(0,ncol(x)),Sl=Sl,nei=nei,...)
## FD checking of derivatives: basically a debugging routine
{  if (!deriv%in%c(1,2)) stop("deriv should be 1 or 2")
   if (control$epsilon>1e-9) control$epsilon <- 1e-9
   b<-gam.fit3(x=x, y=y, sp=sp,Eb=Eb,UrS=UrS,
       offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=deriv,
       control=control,gamma=gamma,scale=scale,printWarn=FALSE,
       start=start,etastart=etastart,mustart=mustart,scoreType=scoreType,
       null.coef=null.coef,Sl=Sl,nei=nei,...)

   P0 <- b$P;fd.P1 <- P10 <- b$P1;
   if (deriv==2) fd.P2 <- P2 <- b$P2
   trA0 <- b$trA;fd.gtrA <- gtrA0 <- b$trA1 ;
   if (deriv==2) fd.htrA <- htrA <- b$trA2
   dev0 <- b$deviance;fd.D1 <- D10 <- b$D1 ;
   if (deriv==2) fd.D2 <- D2 <- b$D2
   fd.db <- b$db.drho*0

   if (scoreType%in%c("REML","P-REML","ML","P-ML")) reml <- TRUE else reml <- FALSE
   sname <- if (reml) "REML" else scoreType
   sname1 <- paste(sname,"1",sep=""); sname2 <- paste(sname,"2",sep="")
   if (scoreType=="NCV") reml <- TRUE ## to avoid un-needed stuff

   score0 <- b[[sname]];grad0 <- b[[sname1]];
   if (deriv==2) hess <- b[[sname2]]
   fd.grad <- grad0*0
   if (deriv==2) fd.hess <- hess
   diter <- rep(20,length(sp))
   for (i in 1:length(sp)) {
     sp1 <- sp;sp1[i] <- sp[i]+eps/2
     bf<-gam.fit3(x=x, y=y, sp=sp1,Eb=Eb,UrS=UrS,
         offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=deriv,
         control=control,gamma=gamma,scale=scale,printWarn=FALSE,
         start=start,etastart=etastart,mustart=mustart,scoreType=scoreType,
         null.coef=null.coef,Sl=Sl,nei=nei,...)
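     ## ... and the matching backward-shifted fit, so each derivative is
     ## approximated by a central difference with O(eps^2) truncation error.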
     sp1 <- sp;sp1[i] <- sp[i]-eps/2
     bb<-gam.fit3(x=x, y=y, sp=sp1, Eb=Eb,UrS=UrS,
         offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=deriv,
         control=control,gamma=gamma,scale=scale,printWarn=FALSE,
         start=start,etastart=etastart,mustart=mustart,scoreType=scoreType,
         null.coef=null.coef,Sl=Sl,nei=nei,...)

     diter[i] <- bf$iter - bb$iter ## check iteration count same
     if (i<=ncol(fd.db)) fd.db[,i] <- (bf$coefficients - bb$coefficients)/eps

     if (!reml) {
       Pb <- bb$P;Pf <- bf$P
       P1b <- bb$P1;P1f <- bf$P1
       trAb <- bb$trA;trAf <- bf$trA
       gtrAb <- bb$trA1;gtrAf <- bf$trA1
       devb <- bb$deviance;devf <- bf$deviance
       D1b <- bb$D1;D1f <- bf$D1
     }

     scoreb <- bb[[sname]];scoref <- bf[[sname]];
     if (deriv==2) { gradb <- bb[[sname1]];gradf <- bf[[sname1]]}

     if (!reml) {
       fd.P1[i] <- (Pf-Pb)/eps
       fd.gtrA[i] <- (trAf-trAb)/eps
       fd.D1[i] <- (devf - devb)/eps
     }

     fd.grad[i] <- (scoref-scoreb)/eps
     if (deriv==2) {
       fd.hess[,i] <- (gradf-gradb)/eps
       if (!reml) {
         fd.htrA[,i] <- (gtrAf-gtrAb)/eps
         fd.P2[,i] <- (P1f-P1b)/eps
         fd.D2[,i] <- (D1f-D1b)/eps
       }
     }
   }

   if (!reml) {
     cat("\n Pearson Statistic... \n")
     cat("grad    ");print(P10)
     cat("fd.grad ");print(fd.P1)
     if (deriv==2) {
       fd.P2 <- .5*(fd.P2 + t(fd.P2))
       cat("hess\n");print(P2)
       cat("fd.hess\n");print(fd.P2)
     }

     cat("\n\n tr(A)... \n")
     cat("grad    ");print(gtrA0)
     cat("fd.grad ");print(fd.gtrA)
     if (deriv==2) {
       fd.htrA <- .5*(fd.htrA + t(fd.htrA))
       cat("hess\n");print(htrA)
       cat("fd.hess\n");print(fd.htrA)
     }

     cat("\n Deviance... \n")
     cat("grad    ");print(D10)
     cat("fd.grad ");print(fd.D1)
     if (deriv==2) {
       fd.D2 <- .5*(fd.D2 + t(fd.D2))
       cat("hess\n");print(D2)
       cat("fd.hess\n");print(fd.D2)
     }
   }

   plot(b$db.drho,fd.db,pch=".")
   for (i in 1:ncol(fd.db)) points(b$db.drho[,i],fd.db[,i],pch=19,cex=.3,col=i)

   cat("\n\n The objective...\n")
   cat("diter   ");print(diter)
   cat("grad    ");print(as.numeric(grad0))
   cat("fd.grad ");print(as.numeric(fd.grad))
   if (deriv==2) {
     fd.hess <- .5*(fd.hess + t(fd.hess))
     cat("hess\n");print(hess)
     cat("fd.hess\n");print(fd.hess)
   }
   NULL
} ## deriv.check


rt <- function(x,r1) {
## transform of x, asymptoting to values in r1
## returns derivatives wrt to x as well as transform values
## r1[i] == NA for no transform
  x <- as.numeric(x)
  ind <- x>0
  rho2 <- rho1 <- rho <- 0*x
  if (length(r1)==1) r1 <- x*0+r1
  h <- exp(x[ind])/(1+exp(x[ind]))
  h1 <- h*(1-h);h2 <- h1*(1-2*h)
  rho[ind] <- r1[ind]*(h-0.5)*2
  rho1[ind] <- r1[ind]*h1*2
  rho2[ind] <- r1[ind]*h2*2
  rho[!ind] <- r1[!ind]*x[!ind]/2
  rho1[!ind] <- r1[!ind]/2
  ind <- is.na(r1)
  rho[ind] <- x[ind]
  rho1[ind] <- 1
  rho2[ind] <- 0
  list(rho=rho,rho1=rho1,rho2=rho2)
} ## rt

rti <- function(r,r1) {
## inverse of rt.
  r <- as.numeric(r)
  ind <- r>0
  x <- r
  if (length(r1)==1) r1 <- x*0+r1
  r2 <- r[ind]*.5/r1[ind] + .5
  x[ind] <- log(r2/(1-r2))
  x[!ind] <- 2*r[!ind]/r1[!ind]
  ind <- is.na(r1)
  x[ind] <- r[ind]
  x
} ## rti

simplyFit <- function(lsp,X,y,Eb,UrS,L,lsp0,offset,U1,Mp,family,weights,
                   control,gamma,scale,conv.tol=1e-6,maxNstep=5,maxSstep=2,
                   maxHalf=30,printWarn=FALSE,scoreType="deviance",
                   mustart = NULL,null.coef=rep(0,ncol(X)),Sl=Sl,nei=NULL,...)
## function with same argument list as `newton' and `bfgs' which simply fits
## the model given the supplied smoothing parameters...
{ reml <- scoreType%in%c("REML","P-REML","ML","P-ML") ## REML/ML indicator
  sname <- if (reml) "REML" else scoreType
  ## sanity check L
  if (is.null(L)) L <- diag(length(lsp)) else {
    if (!inherits(L,"matrix")) stop("L must be a matrix.")
    if (nrow(L)<ncol(L)) stop("L must have at least as many rows as columns.")
  }
  if (is.null(lsp0)) lsp0 <- rep(0,nrow(L))

  ## fit the model at the supplied smoothing parameters...
  b <- gam.fit3(x=X, y=y, sp=L%*%lsp+lsp0,Eb=Eb,UrS=UrS,
       offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=0,
       control=control,gamma=gamma,scale=scale,printWarn=FALSE,mustart=mustart,
       scoreType=scoreType,null.coef=null.coef,Sl=Sl,nei=nei,...)

  if (!is.null(b$warn)&&length(b$warn)>0) for (i in 1:length(b$warn)) warning(b$warn[[i]])
  score <- b[[sname]]

  list(score=score,lsp=lsp,lsp.full=L%*%lsp+lsp0,grad=NULL,hess=NULL,score.hist=NULL,iter=0,conv =NULL,object=b)
} ## simplyFit


newton <- function(lsp,X,y,Eb,UrS,L,lsp0,offset,U1,Mp,family,weights,
                   control,gamma,scale,conv.tol=1e-6,maxNstep=5,maxSstep=2,
                   maxHalf=30,printWarn=FALSE,scoreType="deviance",start=NULL,
                   mustart = NULL,null.coef=rep(0,ncol(X)),pearson.extra,
                   dev.extra=0,n.true=-1,Sl=NULL,edge.correct=FALSE,nei=NULL,...)
## Newton optimizer for GAM reml/gcv/aic optimization that can cope with an
## indefinite Hessian. Main enhancements are:
## i) always perturbs the Hessian to +ve definite if indefinite
## ii) step halves on step failure, without obtaining derivatives until success;
## (iii) carries start values forward from one evaluation to next to speed convergence;
## iv) Always tries the steepest descent direction as well as the
##     Newton direction for indefinite problems (step length on steepest trial could
##     be improved here - currently simply halves until descent achieved).
## L is the matrix such that L%*%lsp + lsp0 gives the logs of the smoothing
## parameters actually multiplying the S[[i]]'s
## NOTE: an obvious acceleration would use db/dsp to produce improved
##       starting values at each iteration...
{ ## inner iteration results need to be accurate enough for conv.tol...
  if (control$epsilon>conv.tol/100) control$epsilon <- conv.tol/100

  reml <- scoreType%in%c("REML","P-REML","ML","P-ML") ## REML/ML indicator
  sname <- if (reml) "REML" else scoreType
  sname1 <- paste(sname,"1",sep=""); sname2 <- paste(sname,"2",sep="")

  ## sanity check L
  if (is.null(L)) L <- diag(length(lsp)) else {
    if (!inherits(L,"matrix")) stop("L must be a matrix.")
    if (nrow(L)<ncol(L)) stop("L must have at least as many rows as columns.")
  }
  if (is.null(lsp0)) lsp0 <- rep(0,nrow(L))

  lsp.max <- NULL ## set non-NULL to impose upper limits on the log smoothing parameters
  if (!is.null(lsp.max)) { ## then there are upper limits on the lsp's
    lsp1.max <- lsp.max ## limits on the working parameter scale
    ind <- lsp>lsp1.max
    lsp[ind] <- lsp1.max[ind]-1 ## reset lsp's already over limit
    delta <- rti(lsp,lsp1.max) ## initial optimization parameters
  } else { ## optimization parameters are just lsp
    delta <- lsp
  }

  ## code designed to be turned on during debugging...
  check.derivs <- FALSE;sp.trace <- FALSE
  if (check.derivs) {
    deriv <- 2
    eps <- 1e-4
    deriv.check(x=X, y=y, sp=L%*%lsp+lsp0, Eb=Eb,UrS=UrS,
        offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=deriv,
        control=control,gamma=gamma,scale=scale,
        printWarn=FALSE,start=start,mustart=mustart,
        scoreType=scoreType,eps=eps,null.coef=null.coef,Sl=Sl,nei=nei,...)
  }
  ## ... end of debugging code

  ## initial fit
  initial.lsp <- lsp ## used if edge correcting to set direction of correction
  b<-gam.fit3(x=X, y=y, sp=L%*%lsp+lsp0,Eb=Eb,UrS=UrS,
     offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=2,
     control=control,gamma=gamma,scale=scale,printWarn=FALSE,start=start,
     mustart=mustart,scoreType=scoreType,null.coef=null.coef,pearson.extra=pearson.extra,
     dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...)
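  ## keep the fit's coefficients, fitted values and linear predictor as warm
  ## starts for the next trial fit. Where upper limits lsp.max are in force,
  ## the gradient and Hessian are mapped to the working delta scale below by
  ## the chain rule: if lsp = r(delta) then dscore/ddelta = r'*grad, and the
  ## Hessian becomes diag(r')%*%hess%*%diag(r') + diag(r''*grad).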
  mustart <- b$fitted.values
  etastart <- b$linear.predictors
  start <- b$coefficients

  old.score <- score <- b[[sname]];grad <- b[[sname1]];hess <- b[[sname2]]

  grad <- t(L)%*%grad
  hess <- t(L)%*%hess%*%L

  if (!is.null(lsp.max)) { ## need to transform to delta space
    rho <- rt(delta,lsp1.max)
    nr <- length(rho$rho1)
    hess <- diag(rho$rho1,nr,nr)%*%hess%*%diag(rho$rho1,nr,nr) + diag(rho$rho2*grad)
    grad <- rho$rho1*grad
  }

  if (reml) score.scale <- abs(log(b$scale.est)) + abs(score) else
            score.scale <- b$scale.est + abs(score)
  uconv.ind <- abs(grad) > score.scale*conv.tol
  ## check for all converged too soon, and undo !
  if (!sum(uconv.ind)) uconv.ind <- uconv.ind | TRUE
  score.hist <- rep(NA,200)
  ################################
  ## Start of Newton iteration....
  ################################
  qerror.thresh <- .8 ## quadratic approx error to tolerate in a step
  for (i in 1:200) {
   if (control$trace) {
     cat("\n",i,"newton max(|grad|) =",max(abs(grad)),"\n")
   }
   ## debugging code for checking derivatives ....
   okc <- check.derivs
   while (okc) {
     okc <- FALSE
     eps <- 1e-4
     deriv <- 2
     if (okc) { ## optional call to fitting to facilitate debugging
       trial.der <- 2 ## can reset if derivs not wanted
       b <- gam.fit3(x=X, y=y, sp=L%*%lsp+lsp0,Eb=Eb,UrS=UrS,
            offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=trial.der,
            control=control,gamma=gamma,scale=scale,printWarn=FALSE,start=start,
            mustart=mustart,scoreType=scoreType,null.coef=null.coef,
            pearson.extra=pearson.extra,dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...)
     }
     deriv.check(x=X, y=y, sp=L%*%lsp+lsp0, Eb=Eb,UrS=UrS,
         offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=deriv,
         control=control,gamma=gamma,scale=scale,
         printWarn=FALSE,etastart=etastart,start=start,
         scoreType=scoreType,eps=eps,null.coef=null.coef,Sl=Sl,nei=nei,...)
     if (inherits(family,"general.family")) { ## call gam.fit5 checking
       eps <- 1e-6
       spe <- 1e-3
       er <- deriv.check5(x=X, y=y, sp=L%*%lsp+lsp0,
           weights = weights, start = start,
           offset = offset,Mp=Mp,family = family,
           control = control,deriv=deriv,eps=eps,spe=spe,
           Sl=Sl,nei=nei,...) ## ignore codetools warning
     }
   } ## end of derivative checking

   ## exclude dimensions from Newton step when the derivative is
   ## tiny relative to largest, as this space is likely to be poorly
   ## modelled on scale of Newton step...
   uconv.ind1 <- uconv.ind & abs(grad)>max(abs(grad))*.001
   if (sum(uconv.ind1)==0) uconv.ind1 <- uconv.ind ## nothing left reset
   if (sum(uconv.ind)==0) uconv.ind[which(abs(grad)==max(abs(grad)))] <- TRUE ## need at least 1 to update

   ## exclude apparently converged gradients from computation
   hess1 <- hess[uconv.ind,uconv.ind]
   grad1 <- grad[uconv.ind]

   ## get the trial step ...
   eh <- eigen(hess1,symmetric=TRUE)
   d <- eh$values;U <- eh$vectors
   indef <- (sum(-d > abs(d[1])*.Machine$double.eps^.5)>0) ## indefinite problem
   ## need a different test if there is only one smoothing parameter,
   ## otherwise infinite sp can count as always indefinite...
   if (indef && length(d)==1) indef <- d < -score.scale * .Machine$double.eps^.5
   ## set eigen-values to their absolute value - heuristically appealing
   ## as it avoids very long steps being proposed for indefinite components,
   ## unlike setting -ve e.v.s to very small +ve constant...
   ind <- d < 0
   pdef <- if (sum(ind)>0) FALSE else TRUE ## is it positive definite?
   d[ind] <- -d[ind] ## see Gill Murray and Wright p107/8
   low.d <- max(d)*.Machine$double.eps^.7
   ind <- d < low.d
   if (sum(ind)>0) pdef <- FALSE ## not certain it is positive definite
   d[ind] <- low.d
   ind <- d != 0
   d[ind] <- 1/d[ind]

   Nstep <- 0 * grad
   Nstep[uconv.ind] <- -drop(U%*%(d*(t(U)%*%grad1))) # (modified) Newton direction
   Sstep <- -grad/max(abs(grad)) # steepest descent direction

   ms <- max(abs(Nstep)) ## note smaller permitted step if !pdef
   mns <- maxNstep
   if (ms>maxNstep) Nstep <- mns * Nstep/ms

   sd.unused <- TRUE ## steepest descent direction not yet tried

   ## try the step ...
   if (sp.trace) cat(lsp,"\n")

   if (!is.null(lsp.max)) { ## need to take step in delta space
     delta1 <- delta + Nstep
     lsp1 <- rt(delta1,lsp1.max)$rho ## transform to log sp space
     while (max(abs(lsp1-lsp))>maxNstep) { ## make sure step is not too long
       Nstep <- Nstep / 2
       delta1 <- delta + Nstep
       lsp1 <- rt(delta1,lsp1.max)$rho
     }
   } else lsp1 <- lsp + Nstep

   ## if pdef==TRUE then get grad and hess immediately, otherwise postpone as
   ## the steepest descent direction should be tried as well as Newton
   b <- gam.fit3(x=X, y=y, sp=L%*%lsp1+lsp0,Eb=Eb,UrS=UrS,
        offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=as.numeric(pdef)*2,
        control=control,gamma=gamma,scale=scale,printWarn=FALSE,start=start,
        mustart=mustart,scoreType=scoreType,null.coef=null.coef,
        pearson.extra=pearson.extra,dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...)

   ## get the change predicted for this step according to the quadratic model
   pred.change <- sum(grad*Nstep) + 0.5*t(Nstep) %*% hess %*% Nstep

   score1 <- b[[sname]]

   ## accept if improvement, else step halve
   ii <- 0 ## step halving counter
   score.change <- score1 - score
   qerror <- abs(pred.change-score.change)/(max(abs(pred.change),abs(score.change))+score.scale*conv.tol)
   ## quadratic approx error
   if (is.finite(score1) && score.change<0 && pdef && qerror < qerror.thresh) {
     ## immediately accept step if it worked and positive definite
     old.score <- score
     mustart <- b$fitted.values
     etastart <- b$linear.predictors
     start <- b$coefficients
     lsp <- lsp1
     score <- b[[sname]]; grad <- b[[sname1]]; hess <- b[[sname2]]
     grad <- t(L)%*%grad
     hess <- t(L)%*%hess%*%L
     if (!is.null(lsp.max)) { ## need to transform to delta space
       delta <- delta1
       rho <- rt(delta,lsp1.max)
       nr <- length(rho$rho1)
       hess <- diag(rho$rho1,nr,nr)%*%hess%*%diag(rho$rho1,nr,nr) + diag(rho$rho2*grad)
       grad <- rho$rho1*grad
     }
   } else if (!is.finite(score1) || score1>=score||qerror >= qerror.thresh) {
     ## initial step failed, try step halving ...
     step <- Nstep ## start with the (pseudo) Newton direction
     while ((!is.finite(score1) || score1>=score ||qerror >= qerror.thresh) && ii < maxHalf) {
       if (ii==3&&i<10) { ## Newton really not working - switch to SD, but keeping step length
         s.length <- min(sum(step^2)^.5,maxSstep)
         step <- Sstep*s.length/sum(Sstep^2)^.5 ## use steepest descent direction
         sd.unused <- FALSE ## signal that SD already tried
       } else step <- step/2
       if (!is.null(lsp.max)) { ## need to take step in delta space
         delta1 <- delta + step
         lsp1 <- rt(delta1,lsp1.max)$rho ## transform to log sp space
       } else lsp1 <- lsp + step
       b1<-gam.fit3(x=X, y=y, sp=L%*%lsp1+lsp0,Eb=Eb,UrS=UrS,offset = offset,U1=U1,Mp=Mp,
            family = family,weights=weights,deriv=0,control=control,gamma=gamma,
            scale=scale,printWarn=FALSE,start=start,mustart=mustart,scoreType=scoreType,
            null.coef=null.coef,pearson.extra=pearson.extra,dev.extra=dev.extra,
            n.true=n.true,Sl=Sl,nei=nei,...)
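       ## compare the achieved score change for the halved step with its
       ## quadratic model prediction below: if the relative discrepancy
       ## (qerror) is large, the local quadratic model is not trusted and the
       ## step is rejected even when the score decreased.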
       pred.change <- sum(grad*step) + 0.5*t(step) %*% hess %*% step ## Taylor prediction of change

       score1 <- b1[[sname]]
       score.change <- score1 - score
       ## don't allow step to fail altogether just because of qerror
       qerror <- if (ii>min(4,maxHalf/2)) qerror.thresh/2 else
                 abs(pred.change-score.change)/(max(abs(pred.change),abs(score.change))+score.scale*conv.tol)
       ## quadratic approx error
       if (is.finite(score1) && score.change < 0 && qerror < qerror.thresh) { ## accept
         if (pdef||!sd.unused) { ## then accept and compute derivatives
           b <- gam.fit3(x=X, y=y, sp=L%*%lsp1+lsp0,Eb=Eb,UrS=UrS,
                offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=2,
                control=control,gamma=gamma,scale=scale,printWarn=FALSE,start=start,
                mustart=mustart,scoreType=scoreType,null.coef=null.coef,
                pearson.extra=pearson.extra,dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...)
           mustart <- b$fitted.values
           etastart <- b$linear.predictors
           start <- b$coefficients
           old.score <- score;lsp <- lsp1
           score <- b[[sname]];grad <- b[[sname1]];hess <- b[[sname2]]
           grad <- t(L)%*%grad
           hess <- t(L)%*%hess%*%L
           if (!is.null(lsp.max)) { ## need to transform to delta space
             delta <- delta1
             rho <- rt(delta,lsp1.max)
             nr <- length(rho$rho1)
             hess <- diag(rho$rho1,nr,nr)%*%hess%*%diag(rho$rho1,nr,nr) + diag(rho$rho2*grad)
             grad <- rho$rho1*grad
           }
         } else { ## still need to try the steepest descent step to see if it does better
           b <- b1
           score2 <- score1 ## store temporarily and restore below
         }
         score1 <- score - abs(score) - 1 ## make sure that score1 < score (restore once out of loop)
       } # end of if (score1<= score ) # accept
       if (!is.finite(score1) || score1>=score || qerror >= qerror.thresh) ii <- ii + 1
     } ## end while (score1>score && ii < maxHalf)

     if (!pdef&&sd.unused&&iiscore2)||kk==40) ok <- FALSE } ## while (ok) ## step length control loop

     ## now pick the step that led to the biggest decrease
     if (is.finite(score2) && score2 score.scale*conv.tol*.1)|(abs(grad2)>score.scale*conv.tol*.1)
     if (sum(abs(grad)>score.scale*conv.tol*5)) converged <- FALSE
     if (abs(old.score-score)>score.scale*conv.tol) {
       if (converged) uconv.ind <- uconv.ind | TRUE ## otherwise can't progress
       converged <- FALSE
     }
     if (ii==maxHalf) converged <- TRUE ## step failure
     if (converged) break
  } ## end of iteration loop

  if (ii==maxHalf) {
    ct <- "step failed"
    warning("Fitting terminated with step failure - check results carefully")
  } else if (i==200) {
    ct <- "iteration limit reached"
    warning("Iteration limit reached without full convergence - check carefully")
  } else ct <- "full convergence"

  b$dVkk <- NULL

  if (as.logical(edge.correct)&&reml) {
    ## for those smoothing parameters that appear to be at working infinity
    ## reduce them until there is a detectable increase in RE/ML...
    flat <- which(abs(grad2) < abs(grad)*100) ## candidates for reduction
    REML <- b[[sname]]
    alpha <- if (is.logical(edge.correct)) .02 else abs(edge.correct) ## target RE/ML change per sp
    b1 <- b; lsp1 <- lsp
    if (length(flat)) {
      step <- as.numeric(initial.lsp > lsp)*2-1 ## could use sign here
      for (i in flat) {
        REML <- b1$REML + alpha
        while (b1$REML < REML) {
          lsp1[i] <- lsp1[i] + step[i]
          b1 <- gam.fit3(x=X, y=y, sp=L%*%lsp1+lsp0,Eb=Eb,UrS=UrS,
               offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=0,
               control=control,gamma=gamma,scale=scale,printWarn=FALSE,start=start,
               mustart=mustart,scoreType=scoreType,null.coef=null.coef,
               pearson.extra=pearson.extra,dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...)
            }
          }
        } ## if length(flat)
        b1 <- gam.fit3(x=X, y=y, sp=L%*%lsp1+lsp0,Eb=Eb,UrS=UrS,
              offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=2,
              control=control,gamma=gamma,scale=scale,printWarn=FALSE,start=start,
              mustart=mustart,scoreType=scoreType,null.coef=null.coef,
              pearson.extra=pearson.extra,dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...)
        score1 <- b1[[sname]];grad1 <- b1[[sname1]];hess1 <- b1[[sname2]]
        grad1 <- t(L)%*%grad1
        hess1 <- t(L)%*%hess1%*%L
        if (!is.null(lsp.max)) { ## need to transform to delta space
          delta <- delta1
          rho <- rt(delta,lsp1.max)
          nr <- length(rho$rho1)
          hess1 <- diag(rho$rho1,nr,nr)%*%hess1%*%diag(rho$rho1,nr,nr) + diag(rho$rho2*grad1)
          grad1 <- rho$rho1*grad1
        }
        attr(hess,"edge.correct") <- TRUE
        attr(hess,"hess1") <- hess1
        attr(hess,"db.drho1") <- b1$db.drho
        attr(hess,"dw.drho1") <- b1$dw.drho
        attr(hess,"lsp1") <- lsp1
        attr(hess,"rp") <- b1$rp
      } ## if edge.correct
      ## report any warnings from inner loop if outer not converged...
      #if (!is.null(b$warn)&&length(b$warn)>0&&ct!="full convergence") for (i in 1:length(b$warn)) warning(b$warn[[i]])
      if (!is.null(b$warn)&&length(b$warn)>0) for (i in 1:length(b$warn)) warning(b$warn[[i]])
      list(score=score,lsp=lsp,lsp.full=L%*%lsp+lsp0,grad=grad,hess=hess,iter=i,
           conv =ct,score.hist = score.hist[!is.na(score.hist)],object=b)
} ## newton

bfgs <- function(lsp,X,y,Eb,UrS,L,lsp0,offset,U1,Mp,family,weights,
                 control,gamma,scale,conv.tol=1e-6,maxNstep=3,maxSstep=2,
                 maxHalf=30,printWarn=FALSE,scoreType="GCV",start=NULL,
                 mustart = NULL,null.coef=rep(0,ncol(X)),pearson.extra=0,
                 dev.extra=0,n.true=-1,Sl=NULL,nei=NULL,...)
## BFGS optimizer to estimate smoothing parameters of models fitted by
## gam.fit3....
##
## L is the matrix such that L%*%lsp + lsp0 gives the logs of the smoothing
## parameters actually multiplying the S[[i]]'s. sp's do not include the
## log scale parameter here.
##
## BFGS is based on Nocedal & Wright (2006) Numerical Optimization, Springer.
## In particular the step lengths are chosen to meet the Wolfe conditions
## using their algorithms 3.5 (p60) and 3.6 (p61). On p143 they recommend a post step
## adjustment to the initial Hessian. I can't understand why one would do anything
## other than adjust so that the initial Hessian would give the step taken, and
## indeed the latter adjustment seems to give faster convergence than their
## proposal, and is therefore implemented.
##
{ zoom <- function(lo,hi) {
  ## local function implementing Algorithm 3.6 of Nocedal & Wright
  ## (2006, p61) Numerical Optimization. Relies on R scoping rules.
  ## alpha.lo and alpha.hi are the bracketing step lengths.
  ## This routine bisection searches for a step length that meets the
  ## Wolfe conditions. lo and hi are both objects containing fields
  ## `score', `alpha', `dscore', where `dscore' is the derivative of
  ## the score in the current step direction, `grad' and `mustart'.
  ## `dscore' will be NULL if the gradient has yet to be evaluated.
    for (i in 1:40) {
      trial <- list(alpha = (lo$alpha+hi$alpha)/2)
      lsp <- ilsp + step * trial$alpha
      b <- gam.fit3(x=X, y=y, sp=L%*%lsp+lsp0,Eb=Eb,UrS=UrS,
           offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=0,
           control=control,gamma=gamma,scale=scale,printWarn=FALSE,start=lo$start,
           mustart=lo$mustart,scoreType=scoreType,null.coef=null.coef,
           pearson.extra=pearson.extra,dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...)
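      ## For reference, the conditions tested below, writing the score along the
      ## step direction as f(alpha), with d = initial$dscore = f'(0) < 0:
      ##   Wolfe 1 (sufficient decrease): f(alpha) <= f(0) + c1*alpha*d
      ##   Wolfe 2 (curvature):           |f'(alpha)| <= -c2*d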
      trial$mustart <- fitted(b)
      trial$scale.est <- b$scale.est ## previously dev, but this differs from newton
      trial$start <- coef(b)
      trial$score <- b[[sname]]
      rm(b)
      if (trial$score>initial$score+trial$alpha*c1*initial$dscore||trial$score>=lo$score) {
        hi <- trial ## failed Wolfe 1 - insufficient decrease - step too long
      } else { ## met Wolfe 1 so check Wolfe 2 - sufficiently positive second derivative?
        b <- gam.fit3(x=X, y=y, sp=L%*%lsp+lsp0,Eb=Eb,UrS=UrS,
             offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=1,
             control=control,gamma=gamma,scale=scale,printWarn=FALSE,
             start=trial$start,mustart=trial$mustart,
             scoreType=scoreType,null.coef=null.coef,pearson.extra=pearson.extra,
             dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...)
        trial$grad <- t(L)%*%b[[sname1]];
        trial$dVkk <- diag(t(L0) %*% b$dVkk %*% L0)
        trial$scale.est <- b$scale.est;rm(b);
        trial$dscore <- sum(step*trial$grad) ## directional derivative
        if (abs(trial$dscore) <= -c2*initial$dscore) return(trial) ## met Wolfe 2
        ## failed Wolfe 2 derivative not increased enough
        if (trial$dscore*(hi$alpha-lo$alpha)>=0) {
          hi <- lo
        }
        lo <- trial
      }
    } ## end of bisection loop
    return(NULL) ## failed
  } ## end zoom

  if (control$epsilon>conv.tol/100) control$epsilon <- conv.tol/100
  reml <- scoreType%in%c("REML","P-REML","ML","P-ML") ## REML/ML indicator
  sname <- if (reml) "REML" else scoreType ## name of score
  sname1 <- paste(sname,"1",sep="") ## names of its derivative
  ## sanity check L
  if (is.null(L)) L <- diag(length(lsp)) else {
    if (!inherits(L,"matrix")) stop("L must be a matrix.")
    if (nrow(L)<ncol(L)) stop("L must have at least as many rows as columns.")
    if (ncol(L)!=length(lsp)) stop("L has inconsistent dimensions.")
  }
  if (is.null(lsp0)) lsp0 <- rep(0,nrow(L))
  ## initial fit...
  initial.lsp <- ilsp <- lsp
  b <- gam.fit3(x=X, y=y, sp=L%*%ilsp+lsp0,Eb=Eb,UrS=UrS,
       offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=1,
       control=control,gamma=gamma,scale=scale,printWarn=FALSE,start=start,
       mustart=mustart,scoreType=scoreType,null.coef=null.coef,
       pearson.extra=pearson.extra,dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...)
  initial <- list(alpha = 0,mustart=b$fitted.values,start=coef(b))
  score <- b[[sname]];grad <- t(L)%*%b[[sname1]]
  ## work out the indices of the smoothing parameters within the full set of
  ## optimization parameters (allowing for family parameters, and a trailing
  ## log scale parameter if free), and of those with curvature info in b$dVkk...
  if (!is.null(family$n.theta)&&family$n.theta>0) {
    ind <- 1:family$n.theta
    nind <- ncol(L) - family$n.theta - if (family$n.theta + nrow(b$dVkk)<ncol(L)) 1 else 0
    spind <- if (nind>0) family$n.theta+1:nind else rep(0,0)
    rspind <- family$n.theta + 1:nrow(b$dVkk)
  } else {
    nind <- ncol(L) - if (nrow(b$dVkk)<ncol(L)) 1 else 0
    spind <- if (nind>0) 1:nind else rep(0,0) ## index of smooth parameters
    rspind <- 1:nrow(b$dVkk)
  }
  L0 <- L[rspind,spind] ##if (nrow(L)!=nrow(b$dVkk)) L[spind,spind] else L
  initial$dVkk <- diag(t(L0) %*% b$dVkk %*% L0)
  initial$score <- score;initial$grad <- grad;
  initial$scale.est <- b$scale.est
  if (reml) score.scale <- 1 + abs(initial$score)
  else score.scale <- abs(initial$scale.est) + abs(initial$score)
  start0 <- coef(b)
  mustart0 <- fitted(b)
  rm(b)
  B <- diag(length(initial$grad)) ## initial Hessian
  feps <- 1e-4;fdgrad <- grad*0
  for (i in 1:length(lsp)) { ## loop to FD for Hessian
    ilsp <- lsp;ilsp[i] <- ilsp[i] + feps
    b <- gam.fit3(x=X, y=y, sp=L%*%ilsp+lsp0,Eb=Eb,UrS=UrS,
         offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=1,
         control=control,gamma=gamma,scale=scale,printWarn=FALSE,
         start=start0,mustart=mustart0,
         scoreType=scoreType,null.coef=null.coef,
         pearson.extra=pearson.extra,dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...)
    grad1 <- t(L)%*%b[[sname1]];
    fdgrad[i] <- (b[[sname]]-score)/feps ## get FD grad for free - useful for debug checks
    B[i,] <- (grad1-grad)/feps
    rm(b)
  } ## end of FD Hessian loop
  ## force initial Hessian to +ve def and invert...
  B <- (B+t(B))/2
  eb <- eigen(B,symmetric=TRUE)
  eb$values <- abs(eb$values)
  thresh <- max(eb$values) * 1e-4
  eb$values[eb$values<thresh] <- thresh
  B <- eb$vectors%*%(t(eb$vectors)/eb$values) ## approximate inverse Hessian
  ilsp <- lsp
  max.step <- 200
  c1 <- 1e-4;c2 <- .9 ## Wolfe condition constants
  score.hist <- rep(NA,max.step+1)
  score.hist[1] <- initial$score
  check.derivs <- FALSE;eps <- 1e-7 ## for optional FD checking of derivatives
  uconv.ind <- rep(TRUE,ncol(B))
  rolled.back <- FALSE
  for (i in 1:max.step) { ## the main BFGS loop
    ## get the trial step...
    step <- initial$grad*0
    step[uconv.ind] <- -drop(B[uconv.ind,uconv.ind]%*%initial$grad[uconv.ind])
    if (sum(step*initial$grad)>=0) { ## step not descending!
      ## Following would really be in the positive definite space...
##step[uconv.ind] <- -solve(chol2inv(chol(B))[uconv.ind,uconv.ind],initial$grad[uconv.ind]) step <- -diag(B)*initial$grad ## simple scaled steepest descent step[!uconv.ind] <- 0 ## don't move if apparently converged } ms <- max(abs(step)) trial <- list() if (ms>maxNstep) { trial$alpha <- maxNstep/ms alpha.max <- trial$alpha*1.05 ## step <- maxNstep * step/ms #alpha.max <- 1 ## was 50 in place of 1 here and below } else { trial$alpha <- 1 alpha.max <- min(2,maxNstep/ms) ## 1*maxNstep/ms } initial$dscore <- sum(step*initial$grad) prev <- initial deriv <- 1 ## only get derivatives immediately for initial step length while(TRUE) { ## step length control Alg 3.5 of N&W (2006, p60) lsp <- ilsp + trial$alpha*step b <- gam.fit3(x=X, y=y, sp=L%*%lsp+lsp0,Eb=Eb,UrS=UrS, offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=deriv, control=control,gamma=gamma,scale=scale,printWarn=FALSE,start=prev$start, mustart=prev$mustart,scoreType=scoreType,null.coef=null.coef, pearson.extra=pearson.extra,dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...) ### Derivative testing code. Not usually called and not part of BFGS... ok <- check.derivs while (ok) { ## derivative testing #deriv <- 1 ok <- FALSE ## set to TRUE to re-run (e.g. with different eps) deriv.check(x=X, y=y, sp=L%*%lsp+lsp0, Eb=Eb,UrS=UrS, offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=1, control=control,gamma=gamma,scale=scale, printWarn=FALSE,mustart=prev$mustart,start=prev$start, scoreType=scoreType,eps=eps,null.coef=null.coef,Sl=Sl,nei=nei,...) ## deal with fact that deriv might be 0... bb <- if (deriv==1) b else gam.fit3(x=X, y=y, sp=L%*%lsp+lsp0,Eb=Eb,UrS=UrS, offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=1, control=control,gamma=gamma,scale=scale,printWarn=FALSE,start=prev$start, mustart=prev$mustart,scoreType=scoreType,null.coef=null.coef, pearson.extra=pearson.extra,dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...) #fdH <- bb$dH fdb.dr <- bb$db.drho*0 if (!is.null(bb$NCV)) { deta.cv <- attr(bb$NCV,"deta.cv") fd.eta <- deta.cv*0 } for (j in 1:ncol(fdb.dr)) { ## check dH and db.drho lsp1 <- lsp;lsp1[j] <- lsp[j] + eps ba <- gam.fit3(x=X, y=y, sp=L%*%lsp1+lsp0,Eb=Eb,UrS=UrS, offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=1, control=control,gamma=gamma,scale=scale,printWarn=FALSE,start=prev$start, mustart=prev$mustart,scoreType=scoreType,null.coef=null.coef, pearson.extra=pearson.extra,dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...) # fdH[[j]] <- (ba$H - bb$H)/eps fdb.dr[,j] <- (ba$coefficients - bb$coefficients)/eps if (!is.null(bb$NCV)) fd.eta[,j] <- as.numeric(attr(ba$NCV,"eta.cv")-attr(bb$NCV,"eta.cv"))/eps } } ### end of derivative testing. BFGS code resumes... trial$score <- b[[sname]]; if (deriv>0) { trial$grad <- t(L)%*%b[[sname1]]; trial$dVkk <- diag(t(L0) %*% b$dVkk %*% L0) ## curvature testing matrix trial$dscore <- sum(trial$grad*step) deriv <- 0 } else trial$grad <- trial$dscore <- NULL trial$mustart <- b$fitted.values trial$start <- b$coefficients trial$scale.est <- b$scale.est rm(b) Wolfe2 <- TRUE ## check the first Wolfe condition (sufficient decrease)... 
if (trial$score>initial$score+c1*trial$alpha*initial$dscore||(deriv==0&&trial$score>=prev$score)) { trial <- zoom(prev,trial) ## Wolfe 1 not met so backtracking break } if (is.null(trial$dscore)) { ## getting gradients b <- gam.fit3(x=X, y=y, sp=L%*%lsp+lsp0,Eb=Eb,UrS=UrS, offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=1, control=control,gamma=gamma,scale=scale,printWarn=FALSE, start=trial$start,mustart=trial$mustart, scoreType=scoreType,null.coef=null.coef,pearson.extra=pearson.extra, dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...) trial$grad <- t(L)%*%b[[sname1]]; trial$dscore <- sum(trial$grad*step) trial$scale.est <- b$scale.est trial$dVkk <- diag(t(L0) %*% b$dVkk %*% L0) ## curvature testing matrix rm(b) } ## Note that written this way so that we can pass on to next test when appropriate... if (abs(trial$dscore) <= -c2*initial$dscore) break; ## `trial' is ok. (2nd Wolfe condition met). Wolfe2 <- FALSE if (trial$dscore>=0) { ## increase at end of trial step trial <- zoom(trial,prev) Wolfe2 <- if (is.null(trial)) FALSE else TRUE break } prev <- trial if (trial$alpha == alpha.max) break ## { trial <- NULL;break;} ## step failed trial <- list(alpha = min(prev$alpha*1.3, alpha.max)) ## increase trial step to try to meet Wolfe 2 } ## end of while(TRUE) ## Now `trial' contains a suitable step, or is NULL on complete failure to meet Wolfe, ## or contains a step that fails to meet Wolfe2, so that B can not be updated if (is.null(trial)) { ## step failed lsp <- ilsp if (rolled.back) break ## failed to move, so nothing more can be done. ## check for working infinite smoothing params... uconv.ind <- abs(initial$grad) > score.scale*conv.tol*.1 uconv.ind[spind] <- uconv.ind[spind] | abs(initial$dVkk) > score.scale * conv.tol*.1 if (sum(!uconv.ind)==0) break ## nothing to move back so nothing more can be done. trial <- initial ## reset to allow roll back converged <- TRUE ## only to signal that roll back should be tried } else { ## update the Hessian etc... yg <- trial$grad-initial$grad step <- step*trial$alpha rho <- sum(yg*step) if (rho>0) { #Wolfe2) { ## only update if Wolfe2 is met, otherwise B can fail to be +ve def. if (i==1) { ## initial step --- adjust Hessian as p143 of N&W B <- B * trial$alpha ## this is my version ## B <- B * sum(yg*step)/sum(yg*yg) ## this is N&W } rho <- 1/rho # sum(yg*step) B <- B - rho*step%*%(t(yg)%*%B) ## Note that Wolfe 2 guarantees that rho>0 and updated B is ## +ve definite (left as an exercise for the reader)... B <- B - rho*(B%*%yg)%*%t(step) + rho*step%*%t(step) } score.hist[i+1] <- trial$score lsp <- ilsp <- ilsp + step ## test for convergence converged <- TRUE if (reml) score.scale <- 1 + abs(trial$score) ## abs(log(trial$dev/nrow(X))) + abs(trial$score) else score.scale <- abs(trial$scale.est) + abs(trial$score) ##trial$dev/nrow(X) + abs(trial$score) uconv.ind <- abs(trial$grad) > score.scale*conv.tol if (sum(uconv.ind)) converged <- FALSE #if (length(uconv.ind)>length(trial$dVkk)) trial$dVkk <- c(trial$dVkk,score.scale) ## following must be tighter than convergence... uconv.ind <- abs(trial$grad) > score.scale*conv.tol*.1 uconv.ind[spind] <- uconv.ind[spind] | abs(trial$dVkk) > score.scale * conv.tol*.1 if (abs(initial$score-trial$score) > score.scale*conv.tol) { if (!sum(uconv.ind)) uconv.ind <- uconv.ind | TRUE ## otherwise can't progress converged <- FALSE } } ## roll back any `infinite' smoothing parameters to the point at ## which score carries some information about them and continue ## optimization. 
    ## Guards against early long steps missing shallow minimum.
    if (converged) { ## try roll back for `working inf' sps...
      if (sum(!uconv.ind)==0||rolled.back) break
      rolled.back <- TRUE
      counter <- 0
      uconv.ind0 <- uconv.ind
      while (sum(!uconv.ind0)>0&&counter<5) {
        ## shrink towards initial values...
        lsp[!uconv.ind0] <- lsp[!uconv.ind0]*.8 + initial.lsp[!uconv.ind0]*.2
        b <- gam.fit3(x=X, y=y, sp=L%*%lsp+lsp0,Eb=Eb,UrS=UrS,
             offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=1,
             control=control,gamma=gamma,scale=scale,printWarn=FALSE,
             start=trial$start,mustart=trial$mustart,
             scoreType=scoreType,null.coef=null.coef,pearson.extra=pearson.extra,
             dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...)
        trial$score <- b[[sname]]
        trial$grad <- t(L)%*%b[[sname1]];
        trial$dscore <- sum(trial$grad*step)
        trial$scale.est <- b$scale.est
        trial$dVkk <- diag(t(L0) %*% b$dVkk %*% L0) ## curvature testing matrix
        #if (length(uconv.ind)>length(trial$dVkk)) trial$dVkk <- c(trial$dVkk,score.scale)
        rm(b);counter <- counter + 1
        ## note that following rolls back until there is clear signal in derivs...
        uconv.ind0 <- abs(trial$grad) > score.scale*conv.tol*20
        uconv.ind0[spind] <- uconv.ind0[spind] | abs(trial$dVkk) > score.scale * conv.tol * 20
        uconv.ind0 <- uconv.ind0 | uconv.ind ## make sure we don't start rolling back unproblematic sps
      }
      uconv.ind <- uconv.ind | TRUE
      ## following line is tempting, but will likely reduce usefulness of B as approximation
      ## to inverse Hessian on return...
      ##B <- diag(diag(B),nrow=nrow(B))
      ilsp <- lsp
    }
    initial <- trial
    initial$alpha <- 0
  } ## end of iteration loop

  if (is.null(trial)) {
    ct <- "step failed"
    lsp <- ilsp
    trial <- initial
  } else if (i==max.step) ct <- "iteration limit reached"
  else ct <- "full convergence"
  ## final fit
  b <- gam.fit3(x=X, y=y, sp=L%*%lsp+lsp0,Eb=Eb,UrS=UrS,
       offset = offset,U1=U1,Mp=Mp,family = family,weights=weights,deriv=1,
       control=control,gamma=gamma,scale=scale,printWarn=FALSE,
       start=trial$start,mustart=trial$mustart,
       scoreType=scoreType,null.coef=null.coef,pearson.extra=pearson.extra,
       dev.extra=dev.extra,n.true=n.true,Sl=Sl,nei=nei,...)
  score <- b[[sname]];grad <- t(L)%*%b[[sname1]];
  if (!is.null(b$Vg)) {
    M <- ncol(b$db.drho)
    b$Vg <- (B%*%t(L)%*%b$Vg%*%L%*%B)[1:M,1:M] ## sandwich estimate of cov matrix of log sp
    db.drho <- b$db.drho%*%L[1:M,1:M,drop=FALSE]
    b$Vc <- db.drho %*% b$Vg %*% t(db.drho) ## correction term for cov matrices
  }
  b$dVkk <- NULL
  ## get approximate Hessian...
  ev <- eigen(B,symmetric=TRUE)
  ind <- ev$values>max(ev$values)*.Machine$double.eps^.9
  ev$values[ind] <- 1/ev$values[ind]
  ev$values[!ind] <- 0
  B <- ev$vectors %*% (ev$values*t(ev$vectors))
  #if (!is.null(b$warn)&&length(b$warn)>0&&ct!="full convergence") for (j in 1:length(b$warn)) warning(b$warn[[j]])
  if (!is.null(b$warn)&&length(b$warn)>0) for (j in 1:length(b$warn)) warning(b$warn[[j]])
  list(score=score,lsp=lsp,lsp.full=L%*%lsp+lsp0,grad=grad,hess=B,iter=i,conv =ct,
       score.hist=score.hist[!is.na(score.hist)],object=b)
} ## end of bfgs

gam2derivative <- function(lsp,args,...)
## Performs IRLS GAM fitting for smoothing parameters given in lsp
## and returns the derivatives of the GCV or UBRE score w.r.t. the
## smoothing parameters for the model.
## args is a list containing the arguments for gam.fit3 ## For use as optim() objective gradient { reml <- args$scoreType%in%c("REML","P-REML","ML","P-ML") ## REML/ML indicator sname <- if (reml) "REML" else args$scoreType sname1 <- paste(sname,"1",sep=""); if (!is.null(args$L)) { lsp <- args$L%*%lsp + args$lsp0 } b<-gam.fit3(x=args$X, y=args$y, sp=lsp,Eb=args$Eb,UrS=args$UrS, offset = args$offset,U1=args$U1,Mp=args$Mp,family = args$family,weights=args$w,deriv=1, control=args$control,gamma=args$gamma,scale=args$scale,scoreType=args$scoreType, null.coef=args$null.coef,n.true=args$n.true,Sl=args$Sl,nei=args$nei,...) ret <- b[[sname1]] if (!is.null(args$L)) ret <- t(args$L)%*%ret ret } ## gam2derivative gam2objective <- function(lsp,args,...) ## Performs IRLS GAM fitting for smoothing parameters given in lsp ## and returns the GCV or UBRE score for the model. ## args is a list containing the arguments for gam.fit3 ## For use as optim() objective { reml <- args$scoreType%in%c("REML","P-REML","ML","P-ML") ## REML/ML indicator sname <- if (reml) "REML" else args$scoreType if (!is.null(args$L)) { lsp <- args$L%*%lsp + args$lsp0 } b<-gam.fit3(x=args$X, y=args$y, sp=lsp,Eb=args$Eb,UrS=args$UrS, offset = args$offset,U1=args$U1,Mp=args$Mp,family = args$family,weights=args$w,deriv=0, control=args$control,gamma=args$gamma,scale=args$scale,scoreType=args$scoreType, null.coef=args$null.coef,n.true=args$n.true,Sl=args$Sl,start=args$start,nei=args$nei,...) ret <- b[[sname]] attr(ret,"full.fit") <- b ret } ## gam2objective gam4objective <- function(lsp,args,...) ## Performs IRLS GAM fitting for smoothing parameters given in lsp ## and returns the GCV or UBRE score for the model. ## args is a list containing the arguments for gam.fit3 ## For use as nlm() objective { reml <- args$scoreType%in%c("REML","P-REML","ML","P-ML") ## REML/ML indicator sname <- if (reml) "REML" else args$scoreType sname1 <- paste(sname,"1",sep=""); if (!is.null(args$L)) { lsp <- args$L%*%lsp + args$lsp0 } b<-gam.fit3(x=args$X, y=args$y, sp=lsp, Eb=args$Eb,UrS=args$UrS, offset = args$offset,U1=args$U1,Mp=args$Mp,family = args$family,weights=args$w,deriv=1, control=args$control,gamma=args$gamma,scale=args$scale,scoreType=args$scoreType, null.coef=args$null.coef,Sl=args$Sl,start=args$start,nei=args$nei,...) ret <- b[[sname]] at <- b[[sname1]] attr(ret,"full.fit") <- b if (!is.null(args$L)) at <- t(args$L)%*%at attr(ret,"gradient") <- at ret } ## gam4objective ## ## The following fix up family objects for use with gam.fit3 ## fix.family.link.general.family <- function(fam) fix.family.link.family(fam) fix.family.link.extended.family <- function(fam) { ## extended families require link derivatives in ratio form. 
## g2g= g''/g'^2, g3g = g'''/g'^3, g4g = g''''/g'^4 - these quantities are often
## less overflow prone than the raw derivatives
  if (!is.null(fam$g2g)&&!is.null(fam$g3g)&&!is.null(fam$g4g)) return(fam)
  link <- fam$link
  if (link=="identity") {
    fam$g2g <- fam$g3g <- fam$g4g <- function(mu) rep.int(0,length(mu))
  } else if (link == "log") {
    fam$g2g <- function(mu) rep(-1,length(mu))
    fam$g3g <- function(mu) rep(2,length(mu))
    fam$g4g <- function(mu) rep(-6,length(mu))
  } else if (link == "inverse") { ## g'(mu) = -1/mu^2
    fam$g2g <- function(mu) 2*mu    ## g'' = 2/mu^3
    fam$g3g <- function(mu) 6*mu^2  ## g''' = -6/mu^4
    fam$g4g <- function(mu) 24*mu^3 ## g'''' = 24/mu^5
  } else if (link == "logit") { ## g = log(mu/(1-mu)) g' = 1/(1-mu) + 1/mu = 1/(mu*(1-mu))
    fam$g2g <- function(mu) mu^2 - (1-mu)^2     ## g'' = 1/(1 - mu)^2 - 1/mu^2
    fam$g3g <- function(mu) 2*mu^3 + 2*(1-mu)^3 ## g''' = 2/(1 - mu)^3 + 2/mu^3
    fam$g4g <- function(mu) 6*mu^4 - 6*(1-mu)^4 ## g'''' = 6/(1-mu)^4 - 6/mu^4
  } else if (link == "sqrt") { ## g = sqrt(mu); g' = .5*mu^-.5
    fam$g2g <- function(mu) - mu^-.5      ## g'' = -.25 * mu^-1.5
    fam$g3g <- function(mu) 3 * mu^-1     ## g''' = .375 * mu^-2.5
    fam$g4g <- function(mu) -15 * mu^-1.5 ## g'''' = -0.9375 * mu^-3.5
  } else if (link == "probit") { ## g(mu) = qnorm(mu); 1/g' = dmu/deta = dnorm(eta)
    fam$g2g <- function(mu) {
      #eta <- fam$linkfun(mu)
      eta <- qnorm(mu)
      ## g'' = eta/fam$mu.eta(eta)^2
      eta
    }
    fam$g3g <- function(mu) {
      #eta <- fam$linkfun(mu)
      eta <- qnorm(mu)
      ## g''' = (1 + 2*eta^2)/fam$mu.eta(eta)^3
      (1 + 2*eta^2)
    }
    fam$g4g <- function(mu) {
      #eta <- fam$linkfun(mu)
      eta <- qnorm(mu)
      ## g'''' = (7*eta + 6*eta^3)/fam$mu.eta(eta)^4
      (7*eta + 6*eta^3)
    }
  } else if (link == "cauchit") {
  ## uses general result that if link is a quantile function then
  ## d mu / d eta = f(eta) where f is the density. Link derivative
  ## is one over this... repeated differentiation w.r.t. mu using chain
  ## rule gives results...
    fam$g2g <- function(mu) {
      #eta <- fam$linkfun(mu)
      eta <- qcauchy(mu)
      ## g'' = 2*pi*pi*eta*(1+eta*eta)
      2*eta/(1+eta*eta)
    }
    fam$g3g <- function(mu) {
      #eta <- fam$linkfun(mu)
      eta <- qcauchy(mu)
      eta2 <- eta*eta
      ## g''' = 2*pi*pi*pi*(1+3*eta2)*(1+eta2)
      2*(1+3*eta2)/(1+eta2)^2
    }
    fam$g4g <- function(mu) {
      #eta <- fam$linkfun(mu)
      eta <- qcauchy(mu)
      eta2 <- eta*eta
      ## g'''' = 2*pi^4*(8*eta+12*eta2*eta)*(1+eta2)
      2*((8+ 12*eta2)/(1+eta2)^2)*(eta/(1+eta2))
    }
  } else if (link == "cloglog") { ## g = log(-log(1-mu)), g' = -1/(log(1-mu)*(1-mu))
    fam$g2g <- function(mu) {
      l1m <- log1p(-mu)
      -l1m - 1
    }
    fam$g3g <- function(mu) {
      l1m <- log1p(-mu)
      l1m*(2*l1m + 3) + 2
    }
    fam$g4g <- function(mu){
      l1m <- log1p(-mu)
      -l1m*(l1m*(6*l1m+11)+12)-6
    }
  } else stop("link not implemented for extended families")
  ## avoid storing the calling environment of fix.family.link...
  environment(fam$g2g) <- environment(fam$g3g) <- environment(fam$g4g) <- environment(fam$linkfun)
  return(fam)
} ## fix.family.link.extended.family

fix.family.link.family <- function(fam)
# adds d2link the second derivative of the link function w.r.t. mu
# to the family supplied, as well as a 3rd derivative function
# d3link...
# All d2link and d3link functions have been checked numerically.
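# A minimal numerical check of the kind referred to above (sketch only, for any
# family `fam' already processed by fix.family.link):
# if (FALSE) {
#   mu <- seq(.1,.9,by=.1); eps <- 1e-6
#   fd <- (fam$d2link(mu+eps)-fam$d2link(mu-eps))/(2*eps) ## FD approximation to d3link
#   range(fd - fam$d3link(mu)) ## should be essentially zero
# }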
{ if (!inherits(fam,"family")) stop("fam not a family object") if (is.null(fam$canonical)) { ## note the canonical link - saves effort in full Newton if (fam$family=="gaussian") fam$canonical <- "identity" else if (fam$family=="poisson"||fam$family=="quasipoisson") fam$canonical <- "log" else if (fam$family=="binomial"||fam$family=="quasibinomial") fam$canonical <- "logit" else if (fam$family=="Gamma") fam$canonical <- "inverse" else if (fam$family=="inverse.gaussian") fam$canonical <- "1/mu^2" else fam$canonical <- "none" } if (!is.null(fam$d2link)&&!is.null(fam$d3link)&&!is.null(fam$d4link)) return(fam) link <- fam$link if (length(link)>1) { if (fam$family=="quasi") # then it's a power link { lambda <- log(fam$linkfun(exp(1))) ## the power, if > 0 if (lambda<=0) { fam$d2link <- function(mu) -1/mu^2 fam$d3link <- function(mu) 2/mu^3 fam$d4link <- function(mu) -6/mu^4 } else { fam$d2link <- function(mu) lambda*(lambda-1)*mu^(lambda-2) fam$d3link <- function(mu) (lambda-2)*(lambda-1)*lambda*mu^(lambda-3) fam$d4link <- function(mu) (lambda-3)*(lambda-2)*(lambda-1)*lambda*mu^(lambda-4) } } else stop("unrecognized (vector?) link") } else if (link=="identity") { fam$d4link <- fam$d3link <- fam$d2link <- function(mu) rep.int(0,length(mu)) } else if (link == "log") { fam$d2link <- function(mu) -1/mu^2 fam$d3link <- function(mu) 2/mu^3 fam$d4link <- function(mu) -6/mu^4 } else if (link == "inverse") { fam$d2link <- function(mu) 2/mu^3 fam$d3link <- function(mu) { mu <- mu*mu;-6/(mu*mu)} fam$d4link <- function(mu) { mu2 <- mu*mu;24/(mu2*mu2*mu)} } else if (link == "logit") { fam$d2link <- function(mu) 1/(1 - mu)^2 - 1/mu^2 fam$d3link <- function(mu) 2/(1 - mu)^3 + 2/mu^3 fam$d4link <- function(mu) 6/(1-mu)^4 - 6/mu^4 } else if (link == "probit") { fam$d2link <- function(mu) { #eta <- fam$linkfun(mu) eta <- qnorm(mu) #eta/fam$mu.eta(eta)^2 eta/pmax(dnorm(eta), .Machine$double.eps)^2 } fam$d3link <- function(mu) { #eta <- fam$linkfun(mu) eta <- qnorm(mu) #(1 + 2*eta^2)/fam$mu.eta(eta)^3 (1 + 2*eta^2)/pmax(dnorm(eta), .Machine$double.eps)^3 } fam$d4link <- function(mu) { #eta <- fam$linkfun(mu) eta <- qnorm(mu) #(7*eta + 6*eta^3)/fam$mu.eta(eta)^4 (7*eta + 6*eta^3)/pmax(dnorm(eta), .Machine$double.eps)^4 } } else if (link == "cloglog") { ## g = log(-log(1-mu)), g' = -1/(log(1-mu)*(1-mu)) fam$d2link <- function(mu) { l1m <- log1p(-mu) -1/((1 - mu)^2*l1m) *(1+ 1/l1m) } fam$d3link <- function(mu) { l1m <- log1p(-mu) mu3 <- (1-mu)^3 (-2 - 3*l1m - 2*l1m^2)/mu3/l1m^3 } fam$d4link <- function(mu){ l1m <- log1p(-mu) mu4 <- (1-mu)^4 ( - 12 - 11 * l1m - 6 * l1m^2 - 6/l1m )/mu4 /l1m^3 } } else if (link == "sqrt") { fam$d2link <- function(mu) -.25 * mu^-1.5 fam$d3link <- function(mu) .375 * mu^-2.5 fam$d4link <- function(mu) -0.9375 * mu^-3.5 } else if (link == "cauchit") { ## uses general result that if link is a quantile function then ## d mu / d eta = f(eta) where f is the density. Link derivative ## is one over this... repeated differentiation w.r.t. mu using chain ## rule gives results... 
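    ## e.g. g(mu) = qcauchy(mu) = eta, so g'(mu) = 1/dcauchy(eta) = pi*(1+eta^2);
    ## applying the chain rule once more (with deta/dmu = g'(mu)) gives
    ## g''(mu) = 2*pi*eta*g'(mu) = 2*pi*pi*eta*(1+eta*eta), the d2link coded below.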
fam$d2link <- function(mu) { #eta <- fam$linkfun(mu) eta <- qcauchy(mu) 2*pi*pi*eta*(1+eta*eta) } fam$d3link <- function(mu) { #eta <- fam$linkfun(mu) eta <- qcauchy(mu) eta2 <- eta*eta 2*pi*pi*pi*(1+3*eta2)*(1+eta2) } fam$d4link <- function(mu) { #eta <- fam$linkfun(mu) eta <- qcauchy(mu) eta2 <- eta*eta 2*pi^4*(8*eta+12*eta2*eta)*(1+eta2) } } else if (link == "1/mu^2") { fam$d2link <- function(mu) 6 * mu^-4 fam$d3link <- function(mu) -24 * mu^-5 fam$d4link <- function(mu) 120 * mu^-6 } else if (substr(link,1,3)=="mu^") { ## it's a power link ## note that lambda <=0 gives log link so don't end up here lambda <- get("lambda",environment(fam$linkfun)) fam$d2link <- function(mu) (lambda*(lambda-1)) * mu^{lambda-2} fam$d3link <- function(mu) (lambda*(lambda-1)*(lambda-2)) * mu^{lambda-3} fam$d4link <- function(mu) (lambda*(lambda-1)*(lambda-2)*(lambda-3)) * mu^{lambda-4} } else stop("link not recognised") ## avoid giant environments being stored.... environment(fam$d2link) <- environment(fam$d3link) <- environment(fam$d4link) <- environment(fam$linkfun) return(fam) } ## fix.family.link.family ## NOTE: something horrible can happen here. The way method dispatch works, the ## environment attached to functions created in fix.family.link is the environment ## from which fix.family.link was called - and this whole environment is stored ## with the created function - in the gam context that means the model matrix is ## stored invisibly away for no useful purpose at all. pryr:::object_size will ## show the true stored size of an object with hidden environments. But environments ## of functions created in method functions should be set explicitly to something ## harmless (see ?environment for some possibilities, empty is rarely a good idea) ## 9/2017 fix.family.link <- function(fam) UseMethod("fix.family.link") fix.family.var <- function(fam) # adds dvar the derivative of the variance function w.r.t. mu # to the family supplied, as well as d2var the 2nd derivative of # the variance function w.r.t. the mean. (All checked numerically). 
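# e.g. for the Gamma case handled below V(mu) = mu^2, so dvar(mu) = 2*mu,
# d2var(mu) = 2 and d3var(mu) = 0; the remaining families follow the same pattern.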
{ if (inherits(fam,"extended.family")) return(fam) if (!inherits(fam,"family")) stop("fam not a family object") if (!is.null(fam$dvar)&&!is.null(fam$d2var)&&!is.null(fam$d3var)) return(fam) family <- fam$family fam$scale <- -1 if (family=="gaussian") { fam$d3var <- fam$d2var <- fam$dvar <- function(mu) rep.int(0,length(mu)) } else if (family=="poisson"||family=="quasipoisson") { fam$dvar <- function(mu) rep.int(1,length(mu)) fam$d3var <- fam$d2var <- function(mu) rep.int(0,length(mu)) if (family=="poisson") fam$scale <- 1 } else if (family=="binomial"||family=="quasibinomial") { fam$dvar <- function(mu) 1-2*mu fam$d2var <- function(mu) rep.int(-2,length(mu)) fam$d3var <- function(mu) rep.int(0,length(mu)) if (family=="binomial") fam$scale <- 1 } else if (family=="Gamma") { fam$dvar <- function(mu) 2*mu fam$d2var <- function(mu) rep.int(2,length(mu)) fam$d3var <- function(mu) rep.int(0,length(mu)) } else if (family=="quasi") { fam$dvar <- switch(fam$varfun, constant = function(mu) rep.int(0,length(mu)), "mu(1-mu)" = function(mu) 1-2*mu, mu = function(mu) rep.int(1,length(mu)), "mu^2" = function(mu) 2*mu, "mu^3" = function(mu) 3*mu^2 ) if (is.null(fam$dvar)) stop("variance function not recognized for quasi") fam$d2var <- switch(fam$varfun, constant = function(mu) rep.int(0,length(mu)), "mu(1-mu)" = function(mu) rep.int(-2,length(mu)), mu = function(mu) rep.int(0,length(mu)), "mu^2" = function(mu) rep.int(2,length(mu)), "mu^3" = function(mu) 6*mu ) fam$d3var <- switch(fam$varfun, constant = function(mu) rep.int(0,length(mu)), "mu(1-mu)" = function(mu) rep.int(0,length(mu)), mu = function(mu) rep.int(0,length(mu)), "mu^2" = function(mu) rep.int(0,length(mu)), "mu^3" = function(mu) rep.int(6,length(mu)) ) } else if (family=="inverse.gaussian") { fam$dvar <- function(mu) 3*mu^2 fam$d2var <- function(mu) 6*mu fam$d3var <- function(mu) rep.int(6,length(mu)) } else stop("family not recognised") environment(fam$dvar) <- environment(fam$d2var) <- environment(fam$d3var) <- environment(fam$linkfun) return(fam) } ## fix.family.var fix.family.ls <- function(fam) # adds ls the log saturated likelihood and its derivatives # w.r.t. the scale parameter to the family object. { if (!inherits(fam,"family")) stop("fam not a family object") if (!is.null(fam$ls)) return(fam) family <- fam$family if (family=="gaussian") { fam$ls <- function(y,w,n,scale) { nobs <- sum(w>0) c(-nobs*log(2*pi*scale)/2 + sum(log(w[w>0]))/2,-nobs/(2*scale),nobs/(2*scale*scale)) } } else if (family=="poisson") { fam$ls <- function(y,w,n,scale) { res <- rep(0,3) res[1] <- sum(dpois(y,y,log=TRUE)*w) res } } else if (family=="binomial") { fam$ls <- function(y,w,n,scale) { c(-binomial()$aic(y,n,y,w,0)/2,0,0) } } else if (family=="Gamma") { fam$ls <- function(y,w,n,scale) { res <- rep(0,3) y <- y[w>0];w <- w[w>0] scale <- scale/w k <- -lgamma(1/scale) - log(scale)/scale - 1/scale res[1] <- sum(k-log(y)) k <- (digamma(1/scale)+log(scale))/(scale*scale) res[2] <- sum(k/w) k <- (-trigamma(1/scale)/(scale) + (1-2*log(scale)-2*digamma(1/scale)))/(scale^3) res[3] <- sum(k/w^2) res } } else if (family=="quasi"||family=="quasipoisson"||family=="quasibinomial") { ## fam$ls <- function(y,w,n,scale) rep(0,3) ## Uses extended quasi-likelihood form... 
    fam$ls <- function(y,w,n,scale) {
      nobs <- sum(w>0)
      c(-nobs*log(scale)/2 + sum(log(w[w>0]))/2,-nobs/(2*scale),nobs/(2*scale*scale))
    }
  } else if (family=="inverse.gaussian") {
    fam$ls <- function(y,w,n,scale) {
      ii <- w>0
      nobs <- sum(ii)
      c(-sum(log(2*pi*scale*y[ii]^3))/2 + sum(log(w[ii]))/2,-nobs/(2*scale),nobs/(2*scale*scale))
      ## c(-sum(w*log(2*pi*scale*y^3))/2,-sum(w)/(2*scale),sum(w)/(2*scale*scale))
    }
  } else stop("family not recognised")
  environment(fam$ls) <- environment(fam$linkfun)
  return(fam)
} ## fix.family.ls

fix.family <- function(fam) {
## allows families to be patched...
  if (fam$family[1]=="gaussian") { ## sensible starting values given link...
    fam$initialize <- expression({
      n <- rep.int(1, nobs)
      if (family$link == "inverse") mustart <- y + (y==0)*sd(y)*.01 else
      if (family$link == "log") mustart <- pmax(y,.01*sd(y)) else
      mustart <- y
    })
  }
  fam
} ## fix.family

negbin <- function (theta = stop("'theta' must be specified"), link = "log") {
## modified from Venables and Ripley's MASS library to work with gam.fit3,
## and to allow a range of `theta' values to be specified
## single `theta' to specify fixed value; 2 theta values (first smaller than second)
## are limits within which to search for theta; otherwise supplied values make up
## search set.
## Note: to avoid warnings, get(".Theta")[1] is used below. Otherwise the initialization
## call to negbin can generate warnings since get(".Theta") returns a vector
## during initialization (only).
  linktemp <- substitute(link)
  if (!is.character(linktemp)) linktemp <- deparse(linktemp)
  if (linktemp %in% c("log", "identity", "sqrt")) stats <- make.link(linktemp)
  else if (is.character(link)) {
    stats <- make.link(link)
    linktemp <- link
  } else {
    if (inherits(link, "link-glm")) {
      stats <- link
      if (!is.null(stats$name)) linktemp <- stats$name
    } else stop(gettextf("%s link not available for negative binomial family; available links are \"identity\", \"log\" and \"sqrt\"",linktemp))
  }
  env <- new.env(parent = .GlobalEnv)
  assign(".Theta", theta, envir = env)
  variance <- function(mu) mu + mu^2/get(".Theta")[1]
  ## dvariance/dmu needed as well
  dvar <- function(mu) 1 + 2*mu/get(".Theta")[1]
  ## d2variance/dmu...
d2var <- function(mu) rep(2/get(".Theta")[1],length(mu)) d3var <- function(mu) rep(0,length(mu)) getTheta <- function() get(".Theta") validmu <- function(mu) all(mu > 0) dev.resids <- function(y, mu, wt) { Theta <- get(".Theta")[1] 2 * wt * (y * log(pmax(1, y)/mu) - (y + Theta) * log((y + Theta)/(mu + Theta))) } aic <- function(y, n, mu, wt, dev) { Theta <- get(".Theta")[1] term <- (y + Theta) * log(mu + Theta) - y * log(mu) + lgamma(y + 1) - Theta * log(Theta) + lgamma(Theta) - lgamma(Theta + y) 2 * sum(term * wt) } ls <- function(y,w,n,scale) { Theta <- get(".Theta")[1] ylogy <- y;ind <- y>0;ylogy[ind] <- y[ind]*log(y[ind]) term <- (y + Theta) * log(y + Theta) - ylogy + lgamma(y + 1) - Theta * log(Theta) + lgamma(Theta) - lgamma(Theta + y) c(-sum(term*w),0,0) } initialize <- expression({ if (any(y < 0)) stop("negative values not allowed for the negative binomial family") n <- rep(1, nobs) mustart <- y + (y == 0)/6 }) rd <- function(mu,wt,scale) { Theta <- get(".Theta")[1] rnbinom(n=length(mu),size=Theta,mu=mu) } qf <- function(p,mu,wt,scale) { Theta <- get(".Theta")[1] qnbinom(p,size=Theta,mu=mu) } environment(qf) <- environment(rd) <- environment(dvar) <- environment(d2var) <- environment(d3var) <-environment(variance) <- environment(validmu) <- environment(ls) <- environment(dev.resids) <- environment(aic) <- environment(getTheta) <- env famname <- paste("Negative Binomial(", format(round(theta,3)), ")", sep = "") structure(list(family = famname, link = linktemp, linkfun = stats$linkfun, linkinv = stats$linkinv, variance = variance,dvar=dvar,d2var=d2var,d3var=d3var, dev.resids = dev.resids, aic = aic, mu.eta = stats$mu.eta, initialize = initialize,ls=ls, validmu = validmu, valideta = stats$valideta,getTheta = getTheta,qf=qf,rd=rd,canonical=""), class = "family") } ## negbin totalPenalty <- function(S,H,off,theta,p) { if (is.null(H)) St <- matrix(0,p,p) else { St <- H; if (ncol(H)!=p||nrow(H)!=p) stop("H has wrong dimension") } theta <- exp(theta) m <- length(theta) if (m>0) for (i in 1:m) { k0 <- off[i] k1 <- k0 + nrow(S[[i]]) - 1 St[k0:k1,k0:k1] <- St[k0:k1,k0:k1] + S[[i]] * theta[i] } St } ## totalPenalty totalPenaltySpace <- function(S,H,off,p) { ## function to obtain (orthogonal) basis for the null space and ## range space of the penalty, and obtain actual null space dimension ## components are roughly rescaled to avoid any dominating Hscale <- sqrt(sum(H*H)); if (Hscale==0) H <- NULL ## H was all zeroes anyway! if (is.null(H)) St <- matrix(0,p,p) else { St <- H/sqrt(sum(H*H)); if (ncol(H)!=p||nrow(H)!=p) stop("H has wrong dimension") } m <- length(S) if (m>0) for (i in 1:m) { k0 <- off[i] k1 <- k0 + nrow(S[[i]]) - 1 St[k0:k1,k0:k1] <- St[k0:k1,k0:k1] + S[[i]]/sqrt(sum(S[[i]]*S[[i]])) } es <- eigen(St,symmetric=TRUE) ind <- es$values>max(es$values)*.Machine$double.eps^.66 Y <- es$vectors[,ind,drop=FALSE] ## range space Z <- es$vectors[,!ind,drop=FALSE] ## null space - ncol(Z) is null space dimension E <- sqrt(as.numeric(es$values[ind]))*t(Y) ## E'E = St list(Y=Y,Z=Z,E=E) } ## totalPenaltySpace mini.roots <- function(S,off,np,rank=NULL) # function to obtain square roots, B[[i]], of S[[i]]'s having as few # columns as possible. S[[i]]=B[[i]]%*%t(B[[i]]). np is the total number # of parameters. S is in packed form. rank[i] is optional supplied rank # of S[[i]], rank[i] < 1, or rank=NULL to estimate. 
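# Usage sketch (illustrative only): given a penalty list S, offsets off and np
# coefficients in total,
#   B <- mini.roots(S,off,np)
# yields roots such that B[[i]]%*%t(B[[i]]) reproduces S[[i]], padded with zeroes
# out to np by np.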
{ m <- length(S)
  if (m<=0) return(list())
  B <- S
  if (is.null(rank)) rank <- rep(-1,m)
  for (i in 1:m) {
    b <- mroot(S[[i]],rank=rank[i])
    B[[i]] <- matrix(0,np,ncol(b))
    B[[i]][off[i]:(off[i]+nrow(b)-1),] <- b
  }
  B
} ## mini.roots

ldTweedie0 <- function(y,mu=y,p=1.5,phi=1,rho=NA,theta=NA,a=1.001,b=1.999) {
## evaluates log Tweedie density for 1<=p<=2, using series summation of
## Dunn & Smyth (2005) Statistics and Computing 15:267-280.
## Original fixed p and phi version.
  if (!is.na(rho)&&!is.na(theta)) { ## use rho and theta and get derivs w.r.t. these
    if (length(rho)>1||length(theta)>1) stop("only scalar `rho' and `theta' allowed.")
    if (a>=b||a<=1||b>=2) stop("1<a<b<2 (strict) required")
    work.param <- TRUE ## working parameterization in use
    phi <- exp(rho)
    th <- theta
    p <- if (th>0) (b+a*exp(-th))/(1+exp(-th)) else (b*exp(th)+a)/(exp(th)+1)
    dpth1 <- if (th>0) exp(-th)*(b-a)/(1+exp(-th))^2 else exp(th)*(b-a)/(exp(th)+1)^2
    dpth2 <- if (th>0) ((a-b)*exp(-th)+(b-a)*exp(-2*th))/(exp(-th)+1)^3 else
                       ((a-b)*exp(2*th)+(b-a)*exp(th))/(exp(th)+1)^3
  } else { ## still need working params for tweedious call...
    work.param <- FALSE
    if (length(p)>1||length(phi)>1) stop("only scalar `p' and `phi' allowed.")
    rho <- log(phi)
    if (p>1&&p<2) {
      if (p <= a) a <- (1+p)/2
      if (p >= b) b <- (2+p)/2
      pabp <- (p-a)/(b-p)
      theta <- log((p-a)/(b-p))
      dthp1 <- (1+pabp)/(p-a)
      dthp2 <- (pabp+1)/((p-a)*(b-p)) -(pabp+1)/(p-a)^2
    }
  }
  if (p<1||p>2) stop("p must be in [1,2]")
  ld <- cbind(y,y,y);ld <- cbind(ld,ld*NA)
  if (p == 2) { ## It's Gamma
    if (sum(y<=0)) stop("y must be strictly positive for a Gamma density")
    ld[,1] <- dgamma(y, shape = 1/phi,rate = 1/(phi * mu),log=TRUE)
    ld[,2] <- (digamma(1/phi) + log(phi) - 1 + y/mu - log(y/mu))/(phi*phi)
    ld[,3] <- -2*ld[,2]/phi + (1-trigamma(1/phi)/phi)/(phi^3)
    return(ld)
  }
  if (length(mu)==1) mu <- rep(mu,length(y))
  if (p == 1) { ## It's Poisson like
    ## ld[,1] <- dpois(x = y/phi, lambda = mu/phi,log=TRUE)
    if (all.equal(y/phi,round(y/phi))!=TRUE) stop("y must be an integer multiple of phi for Tweedie(p=1)")
    ind <- (y!=0)|(mu!=0) ## take care to deal with y log(mu) when y=mu=0
    bkt <- y*0
    bkt[ind] <- (y[ind]*log(mu[ind]/phi) - mu[ind])
    dig <- digamma(y/phi+1)
    trig <- trigamma(y/phi+1)
    ld[,1] <- bkt/phi - lgamma(y/phi+1)
    ld[,2] <- (-bkt - y + dig*y)/(phi*phi)
    ld[,3] <- (2*bkt + 3*y - 2*dig*y - trig *y*y/phi)/(phi^3)
    return(ld)
  }
  ## .. otherwise need the full series thing....
  ## first deal with the zeros
  ind <- y==0;ld[ind,] <- 0
  ind <- ind & mu>0 ## need mu condition otherwise may try to find log(0)
  ld[ind,1] <- -mu[ind]^(2-p)/(phi*(2-p))
  ld[ind,2] <- -ld[ind,1]/phi   ## dld/d phi
  ld[ind,3] <- -2*ld[ind,2]/phi ## d2ld/dphi2
  ld[ind,4] <- -ld[ind,1] * (log(mu[ind]) - 1/(2-p)) ## dld/dp
  ld[ind,5] <- 2*ld[ind,4]/(2-p) + ld[ind,1]*log(mu[ind])^2 ## d2ld/dp2
  ld[ind,6] <- -ld[ind,4]/phi ## d2ld/dphidp
  if (sum(!ind)==0) return(ld)
  ## now the non-zeros
  ind <- y==0
  y <- y[!ind];mu <- mu[!ind]
  w <- w1 <- w2 <- y*0
  oo <- .C(C_tweedious,w=as.double(w),w1=as.double(w1),w2=as.double(w2),w1p=as.double(y*0),w2p=as.double(y*0),
           w2pp=as.double(y*0),y=as.double(y),eps=as.double(.Machine$double.eps^2),n=as.integer(length(y)),
           th=as.double(theta),rho=as.double(rho),a=as.double(a),b=as.double(b))
  if (!work.param) { ## transform working param derivatives to p/phi derivs...
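    ## The chain rule behind these transforms: with rho = log(phi),
    ## dl/dphi = (dl/drho)/phi and d2l/dphi2 = (d2l/drho2 - dl/drho)/phi^2, which is
    ## what the w1/w2 lines below implement; the w1p/w2p lines treat theta(p) in the
    ## same way, via dthp1 = dtheta/dp and dthp2 = d2theta/dp2.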
    oo$w2 <- oo$w2/phi^2 - oo$w1/phi^2
    oo$w1 <- oo$w1/phi
    oo$w2p <- oo$w2p*dthp1^2 + dthp2 * oo$w1p
    oo$w1p <- oo$w1p*dthp1
    oo$w2pp <- oo$w2pp*dthp1/phi ## this appears to be wrong
  }
  log.mu <- log(mu)
  mu1p <- theta <- mu^(1-p)
  k.theta <- mu*theta/(2-p) ## mu^(2-p)/(2-p)
  theta <- theta/(1-p)      ## mu^(1-p)/(1-p)
  l.base <- mu1p*(y/(1-p)-mu/(2-p))/phi
  ld[!ind,1] <- l.base - log(y)    ## log density
  ld[!ind,2] <- -l.base/phi        ## d log f / dphi
  ld[!ind,3] <- 2*l.base/(phi*phi) ## d2 logf / dphi2
  x <- theta*y*(1/(1-p) - log.mu)/phi + k.theta*(log.mu-1/(2-p))/phi
  ld[!ind,4] <- x
  ld[!ind,5] <- theta * y * (log.mu^2 - 2*log.mu/(1-p) + 2/(1-p)^2)/phi -
                k.theta * (log.mu^2 - 2*log.mu/(2-p) + 2/(2-p)^2)/phi ## d2 logf / dp2
  ld[!ind,6] <- - x/phi ## d2 logf / dphi dp
  if (work.param) { ## transform derivs to derivs wrt working
    ld[,3] <- ld[,3]*phi^2 + ld[,2]*phi
    ld[,2] <- ld[,2]*phi
    ld[,5] <- ld[,5]*dpth1^2 + ld[,4]*dpth2
    ld[,4] <- ld[,4]*dpth1
    ld[,6] <- ld[,6]*dpth1*phi
  }
  if (TRUE) { ## DEBUG disconnection of a terms
    ld[!ind,1] <- ld[!ind,1] + oo$w    ## log density
    ld[!ind,2] <- ld[!ind,2] + oo$w1   ## d log f / dphi
    ld[!ind,3] <- ld[!ind,3] + oo$w2   ## d2 logf / dphi2
    ld[!ind,4] <- ld[!ind,4] + oo$w1p
    ld[!ind,5] <- ld[!ind,5] + oo$w2p  ## d2 logf / dp2
    ld[!ind,6] <- ld[!ind,6] + oo$w2pp ## d2 logf / dphi dp
  }
  if (FALSE) { ## DEBUG disconnection of density terms
    ld[!ind,1] <- oo$w    ## log density
    ld[!ind,2] <- oo$w1   ## d log f / dphi
    ld[!ind,3] <- oo$w2   ## d2 logf / dphi2
    ld[!ind,4] <- oo$w1p
    ld[!ind,5] <- oo$w2p  ## d2 logf / dp2
    ld[!ind,6] <- oo$w2pp ## d2 logf / dphi dp
  }
  ld
} ## ldTweedie0

ldTweedie <- function(y,mu=y,p=1.5,phi=1,rho=NA,theta=NA,a=1.001,b=1.999,all.derivs=FALSE) {
## evaluates log Tweedie density for 1<=p<=2, using series summation of
## Dunn & Smyth (2005) Statistics and Computing 15:267-280.
  n <- length(y)
  if (all(!is.na(rho))&&all(!is.na(theta))) { ## use rho and theta and get derivs w.r.t. these
    #if (length(rho)>1||length(theta)>1) stop("only scalar `rho' and `theta' allowed.")
    if (a>=b||a<=1||b>=2) stop("1<a<b<2 (strict) required")
    work.param <- TRUE ## working parameterization in use
    phi <- exp(rho)
    th <- theta
    ind <- th > 0;dpth1 <- dpth2 <- p <- rep(0,n)
    ethi <- exp(-th[ind])
    ethni <- exp(th[!ind])
    p[ind] <- (b+a*ethi)/(1+ethi)
    p[!ind] <- (b*ethni+a)/(ethni+1)
    dpth1[ind] <- ethi*(b-a)/(1+ethi)^2
    dpth1[!ind] <- ethni*(b-a)/(ethni+1)^2
    dpth2[ind] <- ((a-b)*ethi+(b-a)*ethi^2)/(ethi+1)^3
    dpth2[!ind] <- ((a-b)*ethni^2+(b-a)*ethni)/(ethni+1)^3
    #p <- if (th>0) (b+a*exp(-th))/(1+exp(-th)) else (b*exp(th)+a)/(exp(th)+1)
    #dpth1 <- if (th>0) exp(-th)*(b-a)/(1+exp(-th))^2 else exp(th)*(b-a)/(exp(th)+1)^2
    #dpth2 <- if (th>0) ((a-b)*exp(-th)+(b-a)*exp(-2*th))/(exp(-th)+1)^3 else
    #         ((a-b)*exp(2*th)+(b-a)*exp(th))/(exp(th)+1)^3
  } else { ## still need working params for tweedious call...
    work.param <- FALSE
    if (all.derivs) warning("all.derivs only available in rho, theta parameterization")
    #if (length(p)>1||length(phi)>1) stop("only scalar `p' and `phi' allowed.")
    buffer <- if (length(unique(p))==1&&length(unique(phi))==1) TRUE else FALSE
    rho <- log(phi)
    if (min(p)>=1&&max(p)<=2) {
      ind <- p>1&p<2
      if (sum(ind)) {
        p.ind <- p[ind]
        if (min(p.ind) <= a) a <- (1+min(p.ind))/2
        if (max(p.ind) >= b) b <- (2+max(p.ind))/2
        pabp <- theta <- dthp1 <- dthp2 <- rep(0,n)
        pabp[ind] <- (p.ind-a)/(b-p.ind)
        theta[ind] <- log((p.ind-a)/(b-p.ind))
        dthp1[ind] <- (1+pabp[ind])/(p.ind-a)
        dthp2[ind] <- (pabp[ind]+1)/((p.ind-a)*(b-p.ind)) -(pabp[ind]+1)/(p.ind-a)^2
      }
    }
  }
  if (min(p)<1||max(p)>2) stop("p must be in [1,2]")
  ld <- cbind(y,y,y);ld <- cbind(ld,ld*NA)
  if (work.param&&all.derivs) ld <- cbind(ld,ld[,1:3]*0,y*0)
  if (length(p)!=n) p <- array(p,dim=n)
  if (length(phi)!=n) phi <- array(phi,dim=n)
  if (length(mu)!=n) mu <- array(mu,dim=n)
  ind <- p == 2
  if (sum(ind)) { ## It's Gamma
    if (sum(y[ind]<=0)) stop("y must be strictly positive for a Gamma density")
    ld[ind,1] <- dgamma(y[ind], shape = 1/phi[ind],rate = 1/(phi[ind] * mu[ind]),log=TRUE)
    ld[ind,2] <- (digamma(1/phi[ind]) + log(phi[ind]) - 1 + y[ind]/mu[ind] - log(y[ind]/mu[ind]))/(phi[ind]*phi[ind])
    ld[ind,3] <- -2*ld[ind,2]/phi[ind] + (1-trigamma(1/phi[ind])/phi[ind])/(phi[ind]^3)
    #return(ld)
  }
  ind <- p == 1
  if (sum(ind)) { ## It's Poisson like
    ## ld[,1] <- dpois(x = y/phi, lambda = mu/phi,log=TRUE)
    if (all.equal(y[ind]/phi[ind],round(y[ind]/phi[ind]))!=TRUE)
      stop("y must be an integer multiple of phi for Tweedie(p=1)")
    indi <- (y[ind]!=0)|(mu[ind]!=0) ## take care to deal with y log(mu) when y=mu=0
    bkt <- y[ind]*0
    bkt[indi] <- ((y[ind])[indi]*log((mu[ind]/phi[ind])[indi]) - (mu[ind])[indi])
    dig <- digamma(y[ind]/phi[ind]+1)
    trig <- trigamma(y[ind]/phi[ind]+1)
    ld[ind,1] <- bkt/phi[ind] - lgamma(y[ind]/phi[ind]+1)
    ld[ind,2] <- (-bkt - y[ind] + dig*y[ind])/(phi[ind]^2)
    ld[ind,3] <- (2*bkt + 3*y[ind] - 2*dig*y[ind] - trig * y[ind]^2/phi[ind])/(phi[ind]^3)
    #return(ld)
  }
  ## .. otherwise need the full series thing....
  ## first deal with the zeros
  ind <- y==0&p>1&p<2;ld[ind,] <- 0
  ind <- ind & mu>0 ## need mu condition otherwise may try to find log(0)
  if (sum(ind)) {
    mu.ind <- mu[ind];p.ind <- p[ind];phii <- phi[ind]
    ld[ind,1] <- -mu.ind^(2-p.ind)/(phii*(2-p.ind))
    ld[ind,2] <- -ld[ind,1]/phii   ## dld/d phi
    ld[ind,3] <- -2*ld[ind,2]/phii ## d2ld/dphi2
    ld[ind,4] <- -ld[ind,1] * (log(mu.ind) - 1/(2-p.ind)) ## dld/dp
    ld[ind,5] <- 2*ld[ind,4]/(2-p.ind) + ld[ind,1]*log(mu.ind)^2 ## d2ld/dp2
    ld[ind,6] <- -ld[ind,4]/phii ## d2ld/dphidp
    if (work.param&&all.derivs) {
      mup <- mu.ind^p.ind
      ld[ind,7] <- -mu.ind/(mup*phii)
      ld[ind,8] <- -(1-p.ind)/(mup*phii)
      ld[ind,9] <- log(mu.ind)*mu.ind/(mup*phii)
      ld[ind,10] <- -ld[ind,7]/phii
    }
  }
  if (sum(!ind)==0) return(ld)
  ## now the non-zeros
  ind <- which(y>0&p>1&p<2)
  y <- y[ind];mu <- mu[ind];p <- p[ind]
  w <- w1 <- w2 <- y*0
  if (length(ind)>0) {
    if (buffer) { ## use code that can buffer expensive lgamma, digamma and trigamma evaluations...
      oo <- .C(C_tweedious,w=as.double(w),w1=as.double(w1),w2=as.double(w2),w1p=as.double(y*0),w2p=as.double(y*0),
               w2pp=as.double(y*0),y=as.double(y),eps=as.double(.Machine$double.eps^2),n=as.integer(length(y)),
               th=as.double(theta[1]),rho=as.double(rho[1]),a=as.double(a),b=as.double(b))
    } else { ## use code that is not able to buffer as p and phi variable...
if (length(theta)!=n) theta <- array(theta,dim=n) if (length(rho)!=n) rho <- array(rho,dim=n) oo <- .C(C_tweedious2,w=as.double(w),w1=as.double(w1),w2=as.double(w2),w1p=as.double(y*0),w2p=as.double(y*0), w2pp=as.double(y*0),y=as.double(y),eps=as.double(.Machine$double.eps^2),n=as.integer(length(y)), th=as.double(theta[ind]),rho=as.double(rho[ind]),a=as.double(a),b=as.double(b)) } if (oo$eps < -.5) { if (oo$eps < -1.5) { ## failure of series in C code oo$w2 <- oo$w1 <- oo$w2p <- oo$w1p <- oo$w2pp <- rep(NA,length(y)) } else warning("Tweedie density may be unreliable - series not fully converged") } phii <- phi[ind] if (!work.param) { ## transform working param derivatives to p/phi derivs... if (length(dthp1)!=n) dthp1 <- array(dthp1,dim=n) if (length(dthp2)!=n) dthp2 <- array(dthp2,dim=n) dthp1i <- dthp1[ind] oo$w2 <- oo$w2/phii^2 - oo$w1/phii^2 oo$w1 <- oo$w1/phii oo$w2p <- oo$w2p*dthp1i^2 + dthp2[ind] * oo$w1p oo$w1p <- oo$w1p*dthp1i oo$w2pp <- oo$w2pp*dthp1i/phii } log.mu <- log(mu) onep <- 1-p twop <- 2-p mu1p <- theta <- mu^onep k.theta <- mu*theta/twop ## mu^(2-p)/(2-p) theta <- theta/onep ## mu^(1-p)/(1-p) a1 <- (y/onep-mu/twop) l.base <- mu1p*a1/phii ld[ind,1] <- l.base - log(y) ## log density ld[ind,2] <- -l.base/phii ## d log f / dphi ld[ind,3] <- 2*l.base/(phii^2) ## d2 logf / dphi2 x <- theta*y*(1/onep - log.mu)/phii + k.theta*(log.mu-1/twop)/phii ld[ind,4] <- x ld[ind,5] <- theta * y * (log.mu^2 - 2*log.mu/onep + 2/onep^2)/phii - k.theta * (log.mu^2 - 2*log.mu/twop + 2/twop^2)/phii ## d2 logf / dp2 ld[ind,6] <- - x/phii ## d2 logf / dphi dp } ## length(ind)>0 if (work.param) { ## transform derivs to derivs wrt working ld[,3] <- ld[,3]*phi^2 + ld[,2]*phi ld[,2] <- ld[,2]*phi ld[,5] <- ld[,5]*dpth1^2 + ld[,4]*dpth2 ld[,4] <- ld[,4]*dpth1 ld[,6] <- ld[,6]*dpth1*phi colnames(ld)[1:6] <- c("l","rho","rho.2","th","th.2","th.rho") } if (work.param&&all.derivs&&length(ind)>0) { #ld <- cbind(ld,ld[,1:4]*0) a2 <- mu1p/(mu*phii) ## 1/(mu^p*phii) ld[ind,7] <- a2*(onep*a1-mu/twop) ## deriv w.r.t mu ld[ind,8] <- -a2*(onep*p*a1/mu+2*onep/twop) ## 2nd deriv w.r.t. mu ld[ind,9] <- a2*(-log.mu*onep*a1-a1 + onep*(y/onep^2-mu/twop^2)+mu*log.mu/twop-mu/twop^2) ## mu p ld[ind,10] <- a2*(mu/(phii*twop) - onep*a1/phii) ## mu phi ## transform to working... 
    ld[,10] <- ld[,10]*phi
    ld[,9] <- ld[,9]*dpth1
    colnames(ld) <- c("l","rho","rho.2","th","th.2","th.rho","mu","mu.2","mu.theta","mu.rho")
  }
  if (length(ind)>0) {
    ld[ind,1] <- ld[ind,1] + oo$w    ## log density
    ld[ind,2] <- ld[ind,2] + oo$w1   ## d log f / dphi
    ld[ind,3] <- ld[ind,3] + oo$w2   ## d2 logf / dphi2
    ld[ind,4] <- ld[ind,4] + oo$w1p
    ld[ind,5] <- ld[ind,5] + oo$w2p  ## d2 logf / dp2
    ld[ind,6] <- ld[ind,6] + oo$w2pp ## d2 logf / dphi dp
  }
  if (FALSE) { ## DEBUG disconnection of density terms
    ld[ind,1] <- oo$w    ## log density
    ld[ind,2] <- oo$w1   ## d log f / dphi
    ld[ind,3] <- oo$w2   ## d2 logf / dphi2
    ld[ind,4] <- oo$w1p
    ld[ind,5] <- oo$w2p  ## d2 logf / dp2
    ld[ind,6] <- oo$w2pp ## d2 logf / dphi dp
  }
  ld
} ## ldTweedie

Tweedie <- function(p=1,link=power(0)) {
## a restricted Tweedie family
  if (p<=1||p>2) stop("Only 1<p<=2 supported")
  linktemp <- substitute(link)
  if (!is.character(linktemp)) linktemp <- deparse(linktemp)
  if (linktemp %in% c("log", "identity", "sqrt","inverse")) stats <- make.link(linktemp)
  else if (is.character(link)) {
    stats <- make.link(link)
    linktemp <- link
  } else {
    if (inherits(link, "link-glm")) {
      stats <- link
      if (!is.null(stats$name)) linktemp <- stats$name
    } else stop(gettextf("link \"%s\" not available for Tweedie family.",linktemp))
  }
  variance <- function(mu) mu^p
  dvar <- function(mu) p*mu^(p-1)
  d2var <- function(mu) p*(p-1)*mu^(p-2)
  d3var <- function(mu) p*(p-1)*(p-2)*mu^(p-3)
  validmu <- function(mu) all(mu >= 0)
  dev.resids <- function(y, mu, wt) {
    y1 <- y + (y == 0)
    if (p == 1) theta <- log(y1/mu) else theta <- (y1^(1 - p) - mu^(1 - p))/(1 - p)
    if (p == 2) kappa <- log(y1/mu) else kappa <- (y^(2 - p) - mu^(2 - p))/(2 - p)
    pmax(2 * wt * (y * theta - kappa),0)
  }
  initialize <- expression({
    n <- rep(1, nobs)
    mustart <- y + 0.1 * (y == 0)
  })
  ls <- function(y,w,n,scale) {
    power <- p
    colSums(w*ldTweedie(y,y,p=power,phi=scale))
  }
  aic <- function(y, n, mu, wt, dev) {
    power <- p
    scale <- dev/sum(wt)
    -2*sum(ldTweedie(y,mu,p=power,phi=scale)[,1]*wt) + 2
  }
  if (p==2) {
    rd <- function(mu,wt,scale) {
      rgamma(mu,shape=1/scale,scale=mu*scale)
    }
  } else {
    rd <- function(mu,wt,scale) {
      rTweedie(mu,p=p,phi=scale)
    }
  }
  structure(list(family = paste("Tweedie(",p,")",sep=""), variance = variance,
            dev.resids = dev.resids,aic = aic, link = linktemp, linkfun = stats$linkfun,
            linkinv = stats$linkinv, mu.eta = stats$mu.eta, initialize = initialize,
            validmu = validmu, valideta = stats$valideta,dvar=dvar,d2var=d2var,d3var=d3var,
            ls=ls,rd=rd,canonical="none"), class = "family")
} ## Tweedie

rTweedie <- function(mu,p=1.5,phi=1) {
## generate Tweedie random variables, with 1<p<2, as a Poisson number
## of gamma deviates summed up...
  if (p<=1||p>=2) stop("p must be in (1,2)")
  if (sum(mu<0)) stop("mean, mu, must be non negative")
  if (phi<=0) stop("scale parameter must be positive")
  lambda <- mu^(2-p)/((2-p)*phi)
  shape <- (2-p)/(p-1)
  scale <- phi*(p-1)*mu^(p-1)
  n.sim <- length(mu)
  ## how many Gamma r.v.s to sum up to get Tweedie
  ## 0 => none, and a zero value
  N <- rpois(length(lambda),lambda)
  ## following is a vector of N[i] copies of each gamma.scale[i]
  ## concatenated one after the other
  gs <- rep(scale,N)
  ## simulate gamma deviates to sum to get tweedie deviates
  y <- rgamma(gs*0+1,shape=shape,scale=gs)
  ## create summation index...
  lab <- rep(1:length(N),N)
  ## sum up each gamma sharing a label. 0 deviate if label does not occur
  o <- .C(C_psum,y=as.double(rep(0,n.sim)),as.double(y),as.integer(lab),as.integer(length(lab)))
  o$y
} ## rTweedie
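## Example use of the Tweedie utilities above (illustrative sketch, not run):
if (FALSE) {
  mu <- rep(2,1000)
  y <- rTweedie(mu,p=1.5,phi=1.3)     ## Poisson sum of gamma deviates
  ld <- ldTweedie(y,mu,p=1.5,phi=1.3) ## log density and its phi/p derivatives
  mean(ld[,1])                        ## mean log likelihood contribution
}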