% File src/library/stats/man/xtabs.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2024 R Core Team
% Distributed under GPL 2 or later

\name{xtabs}
\alias{xtabs}
\alias{print.xtabs}
\title{Cross Tabulation}
\description{
  Create a contingency table (optionally a sparse matrix) from
  cross-classifying factors, usually contained in a data frame,
  using a formula interface.
}
\usage{
xtabs(formula = ~., data = parent.frame(), subset, sparse = FALSE,
      na.action, na.rm = FALSE, addNA = FALSE,
      exclude = if(!addNA) c(NA, NaN), drop.unused.levels = FALSE)

\method{print}{xtabs}(x, na.print = "", \dots)
}
\arguments{
  \item{formula}{a \link{formula} object with the cross-classifying variables
    (separated by \code{+}) on the right-hand side (or an object which
    can be coerced to a formula).  Interactions are not allowed.  On the
    left-hand side, one may optionally give a vector or a matrix of
    counts; in the latter case, the columns are interpreted as
    corresponding to the levels of a variable.  This is useful if the
    data have already been tabulated, see the examples below.}
  \item{data}{an optional matrix or data frame (or similar: see
    \code{\link{model.frame}}) containing the variables in the
    formula \code{formula}.  By default the variables are taken from
    \code{environment(formula)}.}
  \item{subset}{an optional vector specifying a subset of observations
    to be used.}
  \item{sparse}{logical specifying if the result should be a
    \emph{sparse} matrix, i.e., inheriting from
    \code{\link[Matrix:sparseMatrix-class]{sparseMatrix}}. %\linkS4class{sparseMatrix}
    Only works for two factors (since there
    are no higher-order sparse array classes yet).
  }
  \item{na.action}{a \code{\link{function}} which indicates what should
    happen when the variables in \code{formula} (or \code{subset})
    contain \code{\link{NA}}s.  Defaults to \code{\link{na.pass}},
    so \code{na.rm} and \code{addNA}, respectively, control the
    handling of missing values for the two sides of the \code{formula}.
    Using \code{\link{na.omit}} removes any incomplete cases.}
  \item{na.rm}{logical: should missing values on the left-hand side of the
    \code{formula} be treated as zero when computing the \code{\link{sum}}?}
  \item{addNA}{logical indicating if \code{NA}s in the factors should get a separate
    level and be counted, using \code{\link{addNA}(*, ifany=TRUE)}.
    This has no effect if \code{na.action = na.omit}.}
  \item{exclude}{a vector of values to be excluded when forming the
    set of levels of the classifying factors.}
  \item{drop.unused.levels}{a logical indicating whether to drop unused
    levels in the classifying factors.  If this is \code{FALSE} and
    there are unused levels, the table will contain zero marginals, and
    a subsequent chi-squared test for independence of the factors will
    not work.}

  \item{x}{an object of class \code{"xtabs"}.}
  \item{na.print}{character string (or \code{NULL}) indicating how
    \code{\link{NA}}s are printed.  The default (\code{""}) does not show
    \code{NA}s clearly, and \code{na.print = "NA"} may be advisable
    instead.}
  \item{\dots}{further arguments passed to or from other methods.}
}
\details{
  There is a \code{summary} method for contingency table objects created
  by \code{table} or \code{xtabs(*, sparse = FALSE)}, which gives basic
  information and performs a chi-squared test for independence of
  factors (note that the function \code{\link{chisq.test}} currently
  only handles 2-d tables).

  If a left-hand side is given in \code{formula}, its entries are simply
  summed over the cells corresponding to the right-hand side; this also
  works if the LHS does not give counts.

  For variables in \code{formula} which are factors, \code{exclude}
  must be specified explicitly; the default exclusions will not be used.

  In \R versions before 3.4.0, e.g., when \code{na.action = na.pass},
  sometimes zeroes (\code{0}) were returned instead of \code{NA}s.

  In \R versions before 4.4.0, when \code{addNA = FALSE} (the default),
  the default \code{na.action} was \code{\link{na.omit}}, effectively
  treating missing counts as zero.
}
\value{
  By default, when \code{sparse = FALSE},
  a contingency table in array representation of S3 class \code{c("xtabs",
    "table")}, with a \code{"call"} attribute storing the matched call.

  When \code{sparse = TRUE}, a sparse numeric matrix, specifically an
  object of S4 class %\linkS4class{dgTMatrix}
  \code{\link[Matrix:dgTMatrix-class]{dgTMatrix}} from package
  \CRANpkg{Matrix}.
}
\seealso{
  \code{\link{table}} for traditional cross-tabulation, and
  \code{\link{as.data.frame.table}} which is the inverse operation of
  \code{xtabs} (see the \code{DF} example below).

  \code{\link[Matrix:sparseMatrix-class]{sparseMatrix}} on sparse
  matrices in package \CRANpkg{Matrix}.
}
\examples{
## 'esoph' has the frequencies of cases and controls for all levels of
## the variables 'agegp', 'alcgp', and 'tobgp'.
## A two-column matrix on the left-hand side is summed column by column,
## with the columns acting as the levels of an additional variable;
## '.' on the right-hand side stands for the remaining variables in 'data'.
xtabs(cbind(ncases, ncontrols) ~ ., data = esoph)
## Output is not really helpful ... flat tables are better:
ftable(xtabs(cbind(ncases, ncontrols) ~ ., data = esoph))
## In particular if we have fewer factors ...
ftable(xtabs(cbind(ncases, ncontrols) ~ agegp, data = esoph))

## 'UCBAdmissions' is already a contingency table in array form;
## convert it to a data frame so that xtabs() can rebuild tables from it.
DF <- as.data.frame(UCBAdmissions)
## Now 'DF' is a data frame with a grid of the factors and the counts
## in variable 'Freq'.
DF
## Nice for taking margins: 'Freq' is summed over the cells defined by
## the factors named on the right-hand side only.
xtabs(Freq ~ Gender + Admit, DF)
## And for testing independence: the summary method performs a
## chi-squared test for independence of the factors.
summary(xtabs(Freq ~ ., DF))

## With NAs: in a copy of 'DF', set one cell to NA in each of rows 6 to 9
## (in columns 1, 2, 4 and 1, respectively, using matrix indexing).
DN <- DF; DN[cbind(6:9, c(1:2,4,1))] <- NA
DN # 'Freq' is missing only for (Rejected, Female, B)
## The print method has na.print = "" as default, so NA cells look empty:
(xtNA <- xtabs(Freq ~ Gender + Admit, DN))     # NA prints 'invisibly'
print(xtNA, na.print = "NA")                   # show NA's better
xtabs(Freq ~ Gender + Admit, DN, na.rm = TRUE) # ignore missing Freq
## Use addNA = TRUE to tabulate missing factor levels:
xtabs(Freq ~ Gender + Admit, DN, addNA = TRUE)
xtabs(Freq ~ Gender + Admit, DN, addNA = TRUE, na.rm = TRUE)
## na.action = na.omit removes all rows with NAs right from the start:
xtabs(Freq ~ Gender + Admit, DN, na.action = na.omit)

## Create a nice display for the warp break data.
## First add a replicate index 1..9, recycled to length 54, to serve as a
## third classifying variable; 'breaks' is then summed within each cell.
warpbreaks$replicate <- rep_len(1:9, 54)
ftable(xtabs(breaks ~ wool + tension + replicate, data = warpbreaks))

### ---- Sparse Examples ----

## sparse = TRUE needs package Matrix, so these examples run only if it
## can be loaded; withAutoprint() echoes and auto-prints each expression.
\donttest{if(require("Matrix")) withAutoprint({
 ## similar to "nlme"s  'ergoStool' :
 ## 144 rows: 'Type' cycles through T1..T4, 'Subj' is a 9-level factor
 ## in blocks of 4, recycled to the same length.
 d.ergo <- data.frame(Type = paste0("T", rep(1:4, 9*4)),
                      Subj = gl(9, 4, 36*4))
 xtabs(~ Type + Subj, data = d.ergo) # 4 replicates each
 ## Fix the seed so that the sampled rows are reproducible; only 10 rows
 ## drawn from the first 36 are tabulated, giving a table with many zeros.
 set.seed(15) # a subset of cases:
 xtabs(~ Type + Subj, data = d.ergo[sample(36, 10), ], sparse = TRUE)

 ## Hypothetical two-level setup:
 ## each level of 'inner' is assigned exactly one level of 'outer' by
 ## indexing 'inout' with the integer codes of 'inner'.
 inner <- factor(sample(letters[1:25], 100, replace = TRUE))
 inout <- factor(sample(LETTERS[1:5], 25, replace = TRUE))
 fr <- data.frame(inner = inner, outer = inout[as.integer(inner)])
 xtabs(~ inner + outer, fr, sparse = TRUE)
})}% only if Matrix is available
}
\keyword{category}