% File src/library/stats/man/family.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2023 R Core Team
% Distributed under GPL 2 or later

\name{family}
\alias{family}
\alias{binomial}
\alias{gaussian}
\alias{Gamma}
\alias{inverse.gaussian}
\alias{poisson}
\alias{quasi}
\alias{quasibinomial}
\alias{quasipoisson}
%\alias{print.family}

\title{Family Objects for Models}
\usage{
family(object, \dots)

binomial(link = "logit")
gaussian(link = "identity")
Gamma(link = "inverse")
inverse.gaussian(link = "1/mu^2")
poisson(link = "log")
quasi(link = "identity", variance = "constant")
quasibinomial(link = "logit")
quasipoisson(link = "log")
}
\arguments{
  \item{link}{a specification for the model link function.  This can be
    a name/expression, a literal character string, a length-one character
    vector, or an object of class
    \code{"\link[=make.link]{link-glm}"} (such as generated by
    \code{\link{make.link}}) provided it is not specified
    \emph{via} one of the standard names given next.

    The \code{gaussian} family accepts the links (as names)
    \code{identity}, \code{log} and \code{inverse};
    the \code{binomial} family the links \code{logit},
    \code{probit}, \code{cauchit} (corresponding to logistic,
    normal and Cauchy CDFs respectively), \code{log} and
    \code{cloglog} (complementary log-log);
    the \code{Gamma} family the links \code{inverse}, \code{identity}
     and \code{log};
    the \code{poisson} family the links \code{log}, \code{identity},
    and \code{sqrt}; and the \code{inverse.gaussian} family the links
    \code{1/mu^2}, \code{inverse}, \code{identity}
    and \code{log}.

    The \code{quasi} family accepts the links \code{logit}, \code{probit},
    \code{cloglog},  \code{identity}, \code{inverse},
    \code{log}, \code{1/mu^2} and \code{sqrt}, and
    the function \code{\link{power}} can be used to create a
    power link function.
  }
  \item{variance}{for all families other than \code{quasi}, the variance
    function is determined by the family.  The \code{quasi} family will
    accept the literal character string (or unquoted as a name/expression)
    specifications \code{"constant"}, \code{"mu(1-mu)"}, \code{"mu"},
    \code{"mu^2"} and \code{"mu^3"}, a length-one character vector
    taking one of those values, or a list containing components
    \code{varfun}, \code{validmu}, \code{dev.resids}, \code{initialize}
    and \code{name}.
  }
  \item{object}{the function \code{family} accesses the \code{family}
    objects which are stored within objects created by modelling
    functions (e.g., \code{glm}).}
  \item{\dots}{further arguments passed to methods.}
}
\description{
  Family objects provide a convenient way to specify the details of the
  models used by functions such as \code{\link{glm}}.  See the
  documentation for \code{\link{glm}} for the details on how such model
  fitting takes place.
}
\details{
  \code{family} is a generic function with methods for classes
  \code{"glm"} and \code{"lm"} (the latter returning \code{gaussian()}).


  For the \code{binomial} and \code{quasibinomial} families the response
  can be specified in one of three ways:
  \enumerate{
    \item As a factor: \sQuote{success} is interpreted as the factor not
    having the first level (and hence usually of having the second level).
    \item As a numerical vector with values  between \code{0} and
    \code{1}, interpreted as the proportion of successful cases (with the
    total number of cases given by the \code{weights}).
    \item As a two-column integer matrix: the first column gives the
    number of successes and the second the number of failures.
  }

  The \code{quasibinomial} and \code{quasipoisson} families differ from
  the \code{binomial} and \code{poisson} families only in that the
  dispersion parameter is not fixed at one, so they can model
  over-dispersion.  For the binomial case see
  McCullagh and Nelder (1989, pp.\sspace{}124--8).
  Although they show that there is (under some
  restrictions) a model with
  variance proportional to mean as in the quasi-binomial model, note
  that \code{glm} does not compute maximum-likelihood estimates in that
  model.  The behaviour of S is closer to the quasi- variants.
}
\note{
  The \code{link} and \code{variance} arguments have rather awkward
  semantics for back-compatibility.  The recommended way is to supply
  them as quoted character strings, but they can also be supplied
  unquoted (as names or expressions).  Additionally, they can be
  supplied as a length-one character vector giving the name of one of
  the options, or as a list (for \code{link}, of class
  \code{"link-glm"}).  The restrictions apply only to links given as
  names: when given as a character string all the links known to
  \code{\link{make.link}} are accepted.

  This is potentially ambiguous: supplying \code{link = logit} could mean
  the unquoted name of a link or the value of object \code{logit}.  It
  is interpreted if possible as the name of an allowed link, then
  as an object.  (You can force the interpretation to always be the value of
  an object via \code{logit[1]}.)
}
\value{
  An object of class \code{"family"} (which has a concise print method).
  This is a list with elements
  \item{family}{character: the family name.}
  \item{link}{character: the link name.}
  \item{linkfun}{function: the link.}
  \item{linkinv}{function: the inverse of the link function.}
  \item{variance}{function: the variance as a function of the mean.}
  \item{dev.resids}{function giving the deviance for each observation
    as a function of \code{(y, mu, wt)}, used by the
    \code{\link[=residuals.glm]{residuals}} method when computing
    deviance residuals.}
    \item{aic}{function giving the AIC value if appropriate (but \code{NA}
    for the quasi- families).  More precisely, this function
    returns \eqn{-2\ell + 2 s}{-2 ll + 2 s}, where \eqn{\ell}{ll} is the
    log-likelihood and \eqn{s} is the number of estimated scale
    parameters.  Note that the penalty term for the location parameters
    (typically the \dQuote{regression coefficients}) is added elsewhere,
    e.g., in \code{\link{glm.fit}()}, or \code{\link{AIC}()}, see the
    AIC example in \code{\link{glm}}.
    See \code{\link{logLik}} for the assumptions made about the
    dispersion parameter.}
  \item{mu.eta}{function: derivative of the inverse-link function
    with respect to the linear predictor.  If the inverse-link
    function is \eqn{\mu = g^{-1}(\eta)}{mu = ginv(eta)} where
    \eqn{\eta}{eta} is the value of the linear predictor, then this
    function returns
    \eqn{d(g^{-1})/d\eta = d\mu/d\eta}{d(ginv(eta))/d(eta) = d(mu)/d(eta)}.}
  \item{initialize}{expression.  This needs to set up whatever data
    objects are needed for the family as well as \code{n} (needed for
    AIC in the binomial family) and \code{mustart} (see \code{\link{glm}}).}
  \item{validmu}{logical function.  Returns \code{TRUE} if a mean
    vector \code{mu} is within the domain of \code{variance}.}
  \item{valideta}{logical function.   Returns \code{TRUE} if a linear
    predictor \code{eta} is within the domain of \code{linkinv}.}
  \item{simulate}{(optional) function \code{simulate(object, nsim)} to be
    called by the \code{"lm"} method of \code{\link{simulate}}.  It will
    normally return a matrix with \code{nsim} columns and one row for
    each fitted value, but it can also return a list of length
    \code{nsim}. Clearly this will be missing for \sQuote{quasi-}
    families.}
  \item{dispersion}{(optional since \R version 4.3.0) numeric: value of the
    dispersion parameter, if fixed, or \code{NA_real_} if free.}
}
\references{
  McCullagh, P. and Nelder, J. A. (1989)
  \emph{Generalized Linear Models.}
  London: Chapman and Hall.

  Dobson, A. J. (1983)
  \emph{An Introduction to Statistical Modelling.}
  London: Chapman and Hall.

  Cox, D. R. and Snell, E. J. (1981)
  \emph{Applied Statistics; Principles and Examples.}
  London: Chapman and Hall.

  Hastie, T. J. and Pregibon, D. (1992)
  \emph{Generalized linear models.}
  Chapter 6 of \emph{Statistical Models in S}
  eds J. M. Chambers and T. J. Hastie, Wadsworth & Brooks/Cole.
}
\author{
  The design was inspired by S functions of the same names described
  in Hastie & Pregibon (1992) (except \code{quasibinomial} and
  \code{quasipoisson}).
}
\seealso{
  \code{\link{glm}}, \code{\link{power}}, \code{\link{make.link}}.

  For binomial \emph{coefficients}, \code{\link{choose}};
  the binomial and negative binomial \emph{distributions},
  \code{\link{Binomial}}, and \code{\link{NegBinomial}}.
}
\examples{
require(utils) # for str

## Family constructors return objects of class "family": printing uses
## the concise print method, str() shows the underlying list elements.
nf <- gaussian()  # Normal family
nf
str(nf)

## Gamma family with its default "inverse" link
gf <- Gamma()
gf
str(gf)
## individual components are ordinary list elements:
gf$linkinv
gf$variance(-3:4) #- == (.)^2

## Binomial with default 'logit' link:  Check some properties visually:
bi <- binomial()
## grid of linear predictor values from -10 to 10 in steps of 1/8
et <- seq(-10,10, by=1/8)
plot(et, bi$mu.eta(et), type="l")
## show that mu.eta() is derivative of linkinv() :
## (finite differences of linkinv(), drawn at the interval midpoints)
lines((et[-1]+et[-length(et)])/2, col=adjustcolor("red", 1/4),
      diff(bi$linkinv(et))/diff(et), type="l", lwd=4)
## which here is the logistic density:
lines(et, dlogis(et), lwd=3, col=adjustcolor("blue", 1/4))
## The same properties checked numerically.  Note that 'm' is assigned
## inside the second all.equal() call and reused in the third one.
stopifnot(exprs = {
  all.equal(bi$ mu.eta(et), dlogis(et))
  all.equal(bi$linkinv(et), plogis(et) -> m)
  all.equal(bi$linkfun(m ), qlogis(m))    #  logit(.) == qlogis(.) !
})

## Data from example(glm) :
d.AD <- data.frame(treatment = gl(3,3),
                   outcome   = gl(3,1,9),
                   counts    = c(18,17,15, 20,10,20, 25,13,12))
## Poisson fit, kept for comparison with the quasipoisson fit below
glm.D93 <- glm(counts ~ outcome + treatment, d.AD, family = poisson())
## Quasipoisson: compare with above / example(glm) :
## (same formula and data; the dispersion parameter is not fixed at one)
glm.qD93 <- glm(counts ~ outcome + treatment, d.AD, family = quasipoisson())
\donttest{
glm.qD93
anova  (glm.qD93, test = "F")
summary(glm.qD93)
## for Poisson results (same as from 'glm.D93' !) use
## an explicit dispersion = 1 in anova() and summary() :
anova  (glm.qD93, dispersion = 1, test = "Chisq")
summary(glm.qD93, dispersion = 1)
}


## Example of user-specified link, a logit model for p^days
## See Shaffer, T.  2004. Auk 121(2): 526-540.
logexp <- function(days = 1)
{
    ## Build a "link-glm" object for the link  eta = logit(mu^(1/days)),
    ## i.e. mu = plogis(eta)^days.
    ## The link name is computed first, which also evaluates 'days'
    ## before the closures below are created.
    link_name <- paste0("logexp(", days, ")")
    structure(
        list(linkfun  = function(mu) qlogis(mu^(1/days)),
             linkinv  = function(eta) plogis(eta)^days,
             ## chain rule: d(p^days)/d(eta) = days * p^(days-1) * dp/d(eta),
             ## with dp/d(eta) taken from the binomial family's mu.eta()
             mu.eta   = function(eta)
                 days * plogis(eta)^(days-1) * binomial()$mu.eta(eta),
             ## every value of the linear predictor is accepted
             valideta = function(eta) TRUE,
             name     = link_name),
        class = "link-glm")
}
## an object of class "link-glm" can be passed directly as the link:
(bil3 <- binomial(logexp(3)))
\dontshow{stopifnot(length(bil3$mu.eta(as.double(0:5))) == 6)}
## in practice this would be used with a vector of 'days', in
## which case use an offset of 0 in the corresponding formula
## to get the null deviance right.

## Binomial with identity link: often not a good idea, as both
## computationally and conceptually difficult:
binomial(link = "identity")  ## is exactly the same as
binomial(link = make.link("identity"))


## tests of quasi
## simulated data: Poisson counts whose mean is exp(1+x)
x <- rnorm(100)
y <- rpois(100, exp(1+x))
glm(y ~ x, family = quasi(variance = "mu", link = "log"))
# which is the same as
glm(y ~ x, family = poisson)
## other variance functions accepted by quasi(), with the same log link:
glm(y ~ x, family = quasi(variance = "mu^2", link = "log"))
\dontrun{glm(y ~ x, family = quasi(variance = "mu^3", link = "log")) # fails}
## binary (0/1) responses with success probability plogis(x)
y <- rbinom(100, 1, plogis(x))
# need to set a starting value for the next fit
glm(y ~ x, family = quasi(variance = "mu(1-mu)", link = "logit"), start = c(0,1))
}
\keyword{models}